<a href="https://colab.research.google.com/github/jmohsbeck1/jpmc_mle/blob/final_project/DataDazzlers_DineOracle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library Imports

In [None]:
import getpass
import itertools
import json
import os
import warnings
# Notebook-wide noise reduction: export the TF_CPP_MIN_LOG_LEVEL setting through
# the process environment and install a blanket "ignore" filter for Python warnings.
os.environ.update(TF_CPP_MIN_LOG_LEVEL='2')
warnings.filterwarnings(action='ignore', category=Warning)

In [None]:
#Dataframe and numerical library
import pandas as pd 
import numpy as np

#Visualization
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
%matplotlib inline

#Machine Learning Model
#Metrics
from sklearn.metrics import mutual_info_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

#Model Selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

#Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

#Linear Model
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge

#Ensemble
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#Others
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

#Hyper parameter
from sklearn import neighbors, datasets, model_selection

# Data Import

In [None]:
#STEP 1: Import large dataset using opendatasets
if not os.path.exists('yelp-dataset'):
  print("Loading Keys")
  kaggle_key = 'ravikiranbutti'
  kaggle_value = '117268fa41345f39e5baeda66733a0c7'
  os.environ['KAGGLE_USERNAME'] = kaggle_key
  os.environ['KAGGLE_KEY'] = kaggle_value
  !mkdir -p /root/.kaggle
  with open('/root/.kaggle/kaggle.json', 'w') as kaggle_file:
    kaggle_file.write('{"username":"' + kaggle_key + '","key":"' + kaggle_value + '"}')

  print("Loading Data")
  !pip install kaggle
  !kaggle datasets download yelp-dataset/yelp-dataset
  !unzip yelp-dataset.zip -d 'yelp-dataset'
  !rm yelp-dataset.zip

Loading Keys
Loading Keys
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading yelp-dataset.zip to /content
100% 4.07G/4.07G [03:00<00:00, 24.4MB/s]
100% 4.07G/4.07G [03:00<00:00, 24.3MB/s]
Archive:  yelp-dataset.zip
  inflating: yelp-dataset/Dataset_User_Agreement.pdf  
  inflating: yelp-dataset/yelp_academic_dataset_business.json  
  inflating: yelp-dataset/yelp_academic_dataset_checkin.json  
  inflating: yelp-dataset/yelp_academic_dataset_review.json  
  inflating: yelp-dataset/yelp_academic_dataset_tip.json  
  inflating: yelp-dataset/yelp_academic_dataset_user.json  


In [None]:
#STEP 2: Load Business using chunk processing
# The file is opened in a `with` block (explicitly as UTF-8) so the handle is
# closed even if parsing or concatenation fails part-way through; this also
# matches how the review and user files are read.
with open("yelp-dataset/yelp_academic_dataset_business.json", "r", encoding="utf-8") as f:
    # With chunksize set, read_json yields DataFrames of up to 10000 rows each.
    business_chunks = pd.read_json(f, lines=True, chunksize=10000)
    # The chunks must be consumed while the file is still open.
    business = pd.concat(business_chunks)

In [None]:
# Summarise the business frame: per-column non-null counts, dtypes and memory usage
business.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB


In [None]:
#STEP 3: Load Review using chunk processing and filtering unwanted columns
# dtypes applied while parsing. Only `stars` survives the column drop below;
# the useful/funny/cool entries affect the per-chunk parse only.
review_dtypes = {"stars": np.float16,
            "useful": np.int32,
            "funny": np.int32,
            "cool": np.int32,
           }
# Reduced chunks are collected in their own list so that `review` only ever
# refers to the final DataFrame.
review_parts = []
# encoding is given explicitly: without it open() decodes with the platform's
# locale encoding, which may fail or garble non-ASCII text outside UTF-8 locales.
with open("yelp-dataset/yelp_academic_dataset_review.json", "r", encoding="utf-8") as f:
    review_chunks = pd.read_json(f, orient="records", lines=True, dtype=review_dtypes, chunksize=1000)

    for review_chunk in review_chunks:
        # Drop the unwanted columns chunk by chunk so they are only ever held
        # in memory for one 1000-row chunk at a time.
        reduced_review_chunk = review_chunk.drop(columns=['review_id', 'useful','funny','cool', 'text','date'])
        review_parts.append(reduced_review_chunk)

# Stitch the reduced chunks together with a fresh 0..n-1 index.
review = pd.concat(review_parts, ignore_index=True)

In [None]:
# Summarise the reduced review frame: column dtypes and memory usage
review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 3 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      object 
 1   business_id  object 
 2   stars        float16
dtypes: float16(1), object(2)
memory usage: 120.0+ MB


In [None]:
#STEP 4: Load User using chunk processing and filtering unwanted columns
# Reduced chunks are collected in their own list so that `user` only ever
# refers to the final DataFrame.
user_parts = []
# encoding is given explicitly: without it open() decodes with the platform's
# locale encoding, which may fail or garble non-ASCII text outside UTF-8 locales.
with open("yelp-dataset/yelp_academic_dataset_user.json", "r", encoding="utf-8") as f:
    user_chunks = pd.read_json(f, orient="records", lines=True, chunksize=1000)

    for user_chunk in user_chunks:
        # Drop the unwanted columns chunk by chunk so they are only ever held
        # in memory for one 1000-row chunk at a time.
        reduced_user_chunk = user_chunk.drop(columns=['name', 'useful','funny','cool', 'elite','friends'])
        user_parts.append(reduced_user_chunk)

# Stitch the reduced chunks together with a fresh 0..n-1 index.
user = pd.concat(user_parts, ignore_index=True)

In [None]:
# Summarise the reduced user frame: column dtypes and memory usage
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1987897 entries, 0 to 1987896
Data columns (total 16 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   review_count        int64  
 2   yelping_since       object 
 3   fans                int64  
 4   average_stars       float64
 5   compliment_hot      int64  
 6   compliment_more     int64  
 7   compliment_profile  int64  
 8   compliment_cute     int64  
 9   compliment_list     int64  
 10  compliment_note     int64  
 11  compliment_plain    int64  
 12  compliment_cool     int64  
 13  compliment_funny    int64  
 14  compliment_writer   int64  
 15  compliment_photos   int64  
dtypes: float64(1), int64(13), object(2)
memory usage: 242.7+ MB


# Data Cleanup