<a href="https://colab.research.google.com/github/karenbennis/Xy/blob/ml_model/ML_Model_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Connect to Database**

In [1]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.4.6/spark-2.4.6-bin-hadoop2.7.tgz
!tar xf spark-2.4.6-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.6-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

#Interact with SQL
!wget https://jdbc.postgresql.org/download/postgresql-42.2.9.jar

# Start Spark Session(Creating spark application with name defined by appName()) ---IMPORTED WITH EVERY COLAB NOTEBOOK
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("database_transformation").config("spark.driver.extraClassPath","/content/postgresql-42.2.9.jar").getOrCreate()


--2020-07-21 17:52:15--  https://jdbc.postgresql.org/download/postgresql-42.2.9.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 914037 (893K) [application/java-archive]
Saving to: ‘postgresql-42.2.9.jar’


2020-07-21 17:52:15 (4.71 MB/s) - ‘postgresql-42.2.9.jar’ saved [914037/914037]



In [2]:
# Authenticate user
from google.colab import auth
auth.authenticate_user()

In [3]:
# Set project id
project_id = 'xy-yelp'

In [4]:
# Set project
!gcloud config set project {project_id}

Updated property [core/project].


In [5]:
# Get file which was saved to bucket **** file saves as multiple files so this should be addressed in the future, for now the 11mb file should be fine
!gsutil cp -r gs://xy-bucket/json_files/ml_j.json/part-00000-f8a8f21c-0ebe-434c-a8b3-a5c8988dd298-c000.json /tmp/machine_json.json

Copying gs://xy-bucket/json_files/ml_j.json/part-00000-f8a8f21c-0ebe-434c-a8b3-a5c8988dd298-c000.json...
- [1 files][ 62.5 MiB/ 62.5 MiB]                                                
Operation completed over 1 objects/62.5 MiB.                                     


In [6]:
# Set filepath to tmp folder where bucket file was saved **** this could actually be done without saving to the bucket, this is just done to make sure we can 
# save to and read from bucket if we need to
filepath = '/tmp/machine_json.json'

In [7]:
# Import pandas and read json file into DataFrame
import pandas as pd
raw_json = pd.read_json(filepath, lines=True)

In [8]:
#Drop useless columns
raw_json=raw_json[['review_id','stars','stars_one_hot','features']]

In [9]:
# Show dataframe
raw_json.head()

Unnamed: 0,review_id,stars,stars_one_hot,features
0,-7yxrdY13ay15rGB7WibMA,5,"{'type': 0, 'size': 5, 'indices': [1], 'values...","{'type': 0, 'size': 262145, 'indices': [9521, ..."
1,-Be0UUGYuiDJVAM_YqeQuA,4,"{'type': 0, 'size': 5, 'indices': [0], 'values...","{'type': 0, 'size': 262145, 'indices': [78, 31..."
2,-nQHHXi-d_yuW301_Y0EZQ,2,"{'type': 0, 'size': 5, 'indices': [3], 'values...","{'type': 0, 'size': 262145, 'indices': [1846, ..."
3,2L30O7G8IQ6HILpR0t5RFA,5,"{'type': 0, 'size': 5, 'indices': [1], 'values...","{'type': 0, 'size': 262145, 'indices': [8804, ..."
4,4x5yLG7_yGLuN-w6fV0eBw,4,"{'type': 0, 'size': 5, 'indices': [0], 'values...","{'type': 0, 'size': 262145, 'indices': [17141,..."


# **Preprocessing**

In [10]:
# View full contents of cell to understand how json data was read into DataFrame
y=raw_json.iat[0,-1]
print(y)

{'type': 0, 'size': 262145, 'indices': [9521, 13381, 24113, 34146, 47205, 48870, 52657, 55639, 69793, 72944, 74473, 78329, 79660, 80245, 82321, 82582, 87910, 89717, 95454, 95502, 101376, 107367, 109230, 113458, 113462, 115157, 117481, 118144, 120391, 125372, 128924, 130707, 132270, 132538, 134691, 137431, 138356, 141407, 154186, 157120, 168385, 181758, 184251, 188828, 192137, 195155, 208258, 216432, 218117, 221790, 222394, 229772, 232685, 235700, 236821, 240976, 245044, 262144], 'values': [13.146379889022644, 4.547001272864449, 1.6436118877681851, 12.331835858506187, 10.720585530532915, 4.213228093212401, 6.90785527398247, 2.310717259691643, 8.111828078308406, 7.824146005856625, 3.139702638974026, 2.464028238403142, 7.418680897748461, 4.625472888305944, 4.213228093212401, 13.004780331748613, 6.725533717188516, 2.067612965814895, 8.517293186416572, 8.517293186416572, 3.973998404146567, 2.415853786100017, 10.014967732503301, 2.852598100468416, 5.083305981931424, 3.5302677579594492, 3.249

In [11]:
# Separate dictionary values keys into columns keeping the values and column values
features_series = raw_json.features.apply(pd.Series)

In [12]:
# Drop type as it was an extra json value added during conversion
features_series = features_series.drop('type', axis=1)

In [13]:
# Check DataFrame structure
features_series.head(3)

Unnamed: 0,size,indices,values
0,262145,"[9521, 13381, 24113, 34146, 47205, 48870, 5265...","[13.146379889022644, 4.547001272864449, 1.6436..."
1,262145,"[78, 3188, 4200, 4821, 5381, 5947, 8391, 8527,...","[3.9846936932633152, 8.111828078308406, 2.9151..."
2,262145,"[1846, 4106, 7917, 8287, 8630, 8769, 13677, 15...","[11.89024888444809, 6.812545094178145, 4.77962..."


In [14]:
# Initialize target dataframe
new_df = features_series.copy()

In [15]:
# Combine size, indices, and values into tuple containing mutil-dimensional array to match original format
new_df['features'] = list(zip(features_series['indices'], features_series['values']))
new_df=new_df.rename(columns={"indices": "feature_indices", "values": "feature_values"})
new_df=new_df.drop('size',axis=1)

In [16]:
# Inspect DataFrame
new_df.head()

Unnamed: 0,feature_indices,feature_values,features
0,"[9521, 13381, 24113, 34146, 47205, 48870, 5265...","[13.146379889022644, 4.547001272864449, 1.6436...","([9521, 13381, 24113, 34146, 47205, 48870, 526..."
1,"[78, 3188, 4200, 4821, 5381, 5947, 8391, 8527,...","[3.9846936932633152, 8.111828078308406, 2.9151...","([78, 3188, 4200, 4821, 5381, 5947, 8391, 8527..."
2,"[1846, 4106, 7917, 8287, 8630, 8769, 13677, 15...","[11.89024888444809, 6.812545094178145, 4.77962...","([1846, 4106, 7917, 8287, 8630, 8769, 13677, 1..."
3,"[8804, 19862, 30006, 40337, 46639, 47032, 5099...","[6.593874722676491, 5.036053097080879, 2.57975...","([8804, 19862, 30006, 40337, 46639, 47032, 509..."
4,"[17141, 24145, 35715, 48549, 54961, 61231, 755...","[5.259196648395089, 3.550958151216895, 4.47424...","([17141, 24145, 35715, 48549, 54961, 61231, 75..."


In [17]:
# Combine features and labels to DataFrame to be used in machine learning model
machine_df = pd.concat([ raw_json[['review_id','stars']], new_df], axis=1)
machine_df.head()

Unnamed: 0,review_id,stars,feature_indices,feature_values,features
0,-7yxrdY13ay15rGB7WibMA,5,"[9521, 13381, 24113, 34146, 47205, 48870, 5265...","[13.146379889022644, 4.547001272864449, 1.6436...","([9521, 13381, 24113, 34146, 47205, 48870, 526..."
1,-Be0UUGYuiDJVAM_YqeQuA,4,"[78, 3188, 4200, 4821, 5381, 5947, 8391, 8527,...","[3.9846936932633152, 8.111828078308406, 2.9151...","([78, 3188, 4200, 4821, 5381, 5947, 8391, 8527..."
2,-nQHHXi-d_yuW301_Y0EZQ,2,"[1846, 4106, 7917, 8287, 8630, 8769, 13677, 15...","[11.89024888444809, 6.812545094178145, 4.77962...","([1846, 4106, 7917, 8287, 8630, 8769, 13677, 1..."
3,2L30O7G8IQ6HILpR0t5RFA,5,"[8804, 19862, 30006, 40337, 46639, 47032, 5099...","[6.593874722676491, 5.036053097080879, 2.57975...","([8804, 19862, 30006, 40337, 46639, 47032, 509..."
4,4x5yLG7_yGLuN-w6fV0eBw,4,"[17141, 24145, 35715, 48549, 54961, 61231, 755...","[5.259196648395089, 3.550958151216895, 4.47424...","([17141, 24145, 35715, 48549, 54961, 61231, 75..."


In [18]:
new_machine_df=machine_df[['review_id', 'stars', 'feature_indices','feature_values']]
new_machine_df.head()

Unnamed: 0,review_id,stars,feature_indices,feature_values
0,-7yxrdY13ay15rGB7WibMA,5,"[9521, 13381, 24113, 34146, 47205, 48870, 5265...","[13.146379889022644, 4.547001272864449, 1.6436..."
1,-Be0UUGYuiDJVAM_YqeQuA,4,"[78, 3188, 4200, 4821, 5381, 5947, 8391, 8527,...","[3.9846936932633152, 8.111828078308406, 2.9151..."
2,-nQHHXi-d_yuW301_Y0EZQ,2,"[1846, 4106, 7917, 8287, 8630, 8769, 13677, 15...","[11.89024888444809, 6.812545094178145, 4.77962..."
3,2L30O7G8IQ6HILpR0t5RFA,5,"[8804, 19862, 30006, 40337, 46639, 47032, 5099...","[6.593874722676491, 5.036053097080879, 2.57975..."
4,4x5yLG7_yGLuN-w6fV0eBw,4,"[17141, 24145, 35715, 48549, 54961, 61231, 755...","[5.259196648395089, 3.550958151216895, 4.47424..."


In [19]:
def explode(df, lst_cols, fill_value=''):
    # make sure `lst_cols` is a list
    if lst_cols and not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()

    if (lens > 0).all():
        # ALL lists in cells aren't empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .loc[:, df.columns]
    else:
        # at least one list in cells is empty
        return pd.DataFrame({
            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
            for col in idx_cols
        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \
          .loc[:, df.columns]

In [20]:
import numpy as np
y=explode(new_machine_df, lst_cols=['feature_indices','feature_values'])
y

Unnamed: 0,review_id,stars,feature_indices,feature_values
0,-7yxrdY13ay15rGB7WibMA,5,9521,13.146380
1,-7yxrdY13ay15rGB7WibMA,5,13381,4.547001
2,-7yxrdY13ay15rGB7WibMA,5,24113,1.643612
3,-7yxrdY13ay15rGB7WibMA,5,34146,12.331836
4,-7yxrdY13ay15rGB7WibMA,5,47205,10.720586
...,...,...,...,...
626191,yj_LYhizV601mC2Y9ql55A,5,245086,3.396310
626192,yj_LYhizV601mC2Y9ql55A,5,249180,0.309755
626193,yj_LYhizV601mC2Y9ql55A,5,251823,4.840993
626194,yj_LYhizV601mC2Y9ql55A,5,256790,4.866635


In [21]:
new_machine_df=new_machine_df.set_index('review_id')
y=y.set_index('review_id')
ml_df=y.pivot(columns='feature_indices',values='feature_values')

ml_df = ml_df.fillna(0)
ml_df.head()

feature_indices,5,6,8,10,13,14,15,43,47,52,61,67,68,70,78,90,107,116,123,125,132,133,145,150,168,170,181,182,191,193,194,200,204,205,211,216,223,227,228,239,...,261974,261975,261983,261986,261987,261995,261996,262001,262014,262028,262029,262035,262040,262042,262047,262048,262051,262054,262055,262067,262071,262072,262075,262077,262084,262091,262096,262099,262101,262105,262110,262113,262117,262120,262122,262125,262136,262137,262142,262144
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
--8hpKO4bkweLyEucdxI_w,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,527.0
--DazeDpOApabZnIOIPSrQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1131.0
--QCUEmDBlipC_CEutGVFA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,799.0
--TqfVGywiYWh7Sn9ksBmA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0
-04V-J248jjwibqilfLePA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,246.0


In [22]:
# Sample only 1000 rows of df for model
sample_df = ml_df.sample(n=1000, random_state=21)
sample_df.head()

feature_indices,5,6,8,10,13,14,15,43,47,52,61,67,68,70,78,90,107,116,123,125,132,133,145,150,168,170,181,182,191,193,194,200,204,205,211,216,223,227,228,239,...,261974,261975,261983,261986,261987,261995,261996,262001,262014,262028,262029,262035,262040,262042,262047,262048,262051,262054,262055,262067,262071,262072,262075,262077,262084,262091,262096,262099,262101,262105,262110,262113,262117,262120,262122,262125,262136,262137,262142,262144
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
M6bpGGLsCKfNaiaTjaUiVA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,742.0
AN7wQPa5ZHx-I7SgcJy6fw,0.0,0.0,0.0,0.0,0.0,1.478948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,279.0
emJl9kiTGRnNbAt0z04DAQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752.0
uw6eyCTL2KY3u_ju7E69Aw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,562.0
_pmYOW52o5wenplbAcu5wA,0.0,0.0,0.0,0.0,0.0,2.957897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,969.0


In [23]:
new_machine_df.head()

Unnamed: 0_level_0,stars,feature_indices,feature_values
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-7yxrdY13ay15rGB7WibMA,5,"[9521, 13381, 24113, 34146, 47205, 48870, 5265...","[13.146379889022644, 4.547001272864449, 1.6436..."
-Be0UUGYuiDJVAM_YqeQuA,4,"[78, 3188, 4200, 4821, 5381, 5947, 8391, 8527,...","[3.9846936932633152, 8.111828078308406, 2.9151..."
-nQHHXi-d_yuW301_Y0EZQ,2,"[1846, 4106, 7917, 8287, 8630, 8769, 13677, 15...","[11.89024888444809, 6.812545094178145, 4.77962..."
2L30O7G8IQ6HILpR0t5RFA,5,"[8804, 19862, 30006, 40337, 46639, 47032, 5099...","[6.593874722676491, 5.036053097080879, 2.57975..."
4x5yLG7_yGLuN-w6fV0eBw,4,"[17141, 24145, 35715, 48549, 54961, 61231, 755...","[5.259196648395089, 3.550958151216895, 4.47424..."


In [27]:
new_sample_df = pd.merge(sample_df, new_machine_df['stars'], left_index= True, right_index=True)
new_sample_df.head()

Unnamed: 0_level_0,5,6,8,10,13,14,15,43,47,52,61,67,68,70,78,90,107,116,123,125,132,133,145,150,168,170,181,182,191,193,194,200,204,205,211,216,223,227,228,239,...,261975,261983,261986,261987,261995,261996,262001,262014,262028,262029,262035,262040,262042,262047,262048,262051,262054,262055,262067,262071,262072,262075,262077,262084,262091,262096,262099,262101,262105,262110,262113,262117,262120,262122,262125,262136,262137,262142,262144,stars
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
M6bpGGLsCKfNaiaTjaUiVA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,742.0,3
AN7wQPa5ZHx-I7SgcJy6fw,0.0,0.0,0.0,0.0,0.0,1.478948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,279.0,5
emJl9kiTGRnNbAt0z04DAQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752.0,5
uw6eyCTL2KY3u_ju7E69Aw,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,562.0,4
_pmYOW52o5wenplbAcu5wA,0.0,0.0,0.0,0.0,0.0,2.957897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,969.0,4


# **Machine Learning**

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import tensorflow as tf

In [32]:
# Generate our categorical variable list

yelp_cat = ['stars']

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)
yelp_ml_df = pd.DataFrame(new_sample_df)
# do one-hotencoding for each column (independent variables)

for cat in yelp_cat:
    
    # Fit and transform the OneHotEncoder using the categorical variable list
    encode_df = pd.DataFrame(enc.fit_transform(new_sample_df[[cat]]))

    # Add the encoded variable names to the DataFrame
    encode_df.columns = enc.get_feature_names([cat])
    
    # merge all the encoded columns back one data frame
    yelp_ml_df = pd.concat([yelp_ml_df.reset_index(drop=True), encode_df.reset_index(drop=True)], axis=1)
    
# Drop original columns
yelp_ml_df = yelp_ml_df.drop(yelp_cat, 1)

# Set EIN as index
# charity_df_copy = charity_df_copy.set_index('EIN')

yelp_ml_df.head()



Unnamed: 0,5,6,8,10,13,14,15,43,47,52,61,67,68,70,78,90,107,116,123,125,132,133,145,150,168,170,181,182,191,193,194,200,204,205,211,216,223,227,228,239,...,261995,261996,262001,262014,262028,262029,262035,262040,262042,262047,262048,262051,262054,262055,262067,262071,262072,262075,262077,262084,262091,262096,262099,262101,262105,262110,262113,262117,262120,262122,262125,262136,262137,262142,262144,stars_1,stars_2,stars_3,stars_4,stars_5
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,742.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.478948,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,279.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,752.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,562.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,2.957897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,969.0,0.0,0.0,0.0,1.0,0.0


In [33]:
X=yelp_ml_df.drop(['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5'], 1).values
y=yelp_ml_df[['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']].values
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=101)

In [34]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train,y_train)
# it doesnt like that x_train is a list

ValueError: ignored

In [45]:
mnb.predict(X_test)

array([5, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4,
       5, 5, 4, 5, 5, 4, 4, 4, 4, 5, 4, 5, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4,
       4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 5, 4, 5, 4, 4, 4,
       5, 4, 4, 4, 4, 5, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 5,
       4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5, 5,
       4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5,
       4, 4, 4, 4, 4, 4, 3, 4, 5, 4, 4, 4, 4, 5, 4, 4, 4, 4, 5, 5, 5, 4,
       5, 4, 5, 4, 4, 4, 5, 4, 5, 4, 5, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4,
       5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 4, 5, 4, 4, 4, 4, 4, 5,
       4, 4])

In [51]:
mnb.score(X_test, y_test)

0.375

In [35]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [36]:
# Check number of input features
len(X_train_scaled[0])

62013

In [41]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  30
hidden_nodes_layer2 = 13

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=5, activation="softmax"))

# Check the structure of the model
nn.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 30)                1860420   
_________________________________________________________________
dense_4 (Dense)              (None, 13)                403       
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 70        
Total params: 1,860,893
Trainable params: 1,860,893
Non-trainable params: 0
_________________________________________________________________


In [42]:
# Compile the model
nn.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [43]:
# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [44]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

7/7 - 0s - loss: 3.1085 - accuracy: 0.3550
Loss: 3.108480215072632, Accuracy: 0.35499998927116394


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(reviews['text'])

In [None]:
dense = vectors.todense()
denselist = dense.tolist()
df = pd.DataFrame(vectors)