## Course Project Jupyter Notebook

#### Data Files 
- business.csv 
- sample_submission.csv
- test_queries.csv
- train_reviews.csv
- users.csv
- validate_queries.csv

In [None]:
import ast

import numpy as np
import pandas as pd

## Preprocessing Business Data

Expects the csv file to be in an "all" folder in the working directory of this notebook

In [None]:
# Raw business table, plus a working copy that the preprocessing cells modify.
business_df = pd.read_csv("all/business.csv", engine="python")
business_df_replace = business_df.copy()
# Default used when attributes_Ambience is NaN: a dictionary string with every ambience flag set to False.
ambience_flags = ['romantic', 'intimate', 'classy', 'hipster', 'divey', 'touristy', 'trendy', 'upscale', 'casual']
ambience_default = str(dict.fromkeys(ambience_flags, False))
business_df

### Feature Selection

I decided to only use features for which more than half of the values are not NaN; otherwise there are too few data points with a value. This non-NaN threshold can be changed if needed.

I also decided not to use the hours features, as they seem too complex to turn into numerical values and, intuitively, would not help much in predicting a user's review. The same is the case with the latitude, longitude, name, and address features.

In [None]:
# Largest fraction of NaN values a column may have and still be kept as a feature.
MAX_NAN_FRACTION = 0.5

# Columns that are NaN in more than MAX_NAN_FRACTION of the rows. The threshold is
# derived from the actual row count instead of a hard-coded number of rows.
sparse_columns = [
    col for col in business_df.columns
    if business_df[col].isnull().sum() > MAX_NAN_FRACTION * len(business_df)
]
# Columns left out by hand (opening hours, location, free text).
unused_columns = ['hours_Friday', 'hours_Monday', 'hours_Saturday',
       'hours_Sunday', 'hours_Thursday', 'hours_Tuesday', 'hours_Wednesday',
       'is_open', 'latitude', 'longitude', 'postal_code', 'name', 'address', 'categories']

# Rebuild the working frame from the raw frame so that re-running this cell gives the
# same result instead of failing on columns that were already dropped.
business_df_replace = business_df.drop(columns=sparse_columns + unused_columns)
business_df_replace

###  Helper Functions

##### view_column_values
Helps to view what values occur inside the column of a dataframe

##### expand_dict_to_columns
Some columns in the dataframe hold a dictionary string (such as attributes_Ambience). This function expands that dictionary string into extra columns, with each key becoming a column name and each value becoming the row content. It returns the modified dataframe.

##### replace_column_nan
There are many NaN values in the data. This function replaces the NaN values of a specific column of a dataframe with one of the values that already occur in that column. The third parameter, index_of_value_count, selects which value to use as the replacement: it is the position of that value in the output of view_column_values.


In [None]:
def view_column_values(df, column_name):
    """Return how often each distinct value occurs in ``df[column_name]``.

    The result is the Series produced by ``value_counts()`` for that column.
    """
    column = df[column_name]
    return column.value_counts()

def expand_dict_to_columns(df, column_name):
    """Expand a column of dictionary strings into one column per dictionary key.

    Each entry of ``df[column_name]`` must be a string holding a Python dict literal,
    e.g. ``"{'casual': True, 'classy': False}"``. NaN entries have to be filled by the
    caller beforehand.

    Returns a new dataframe: the input columns without ``column_name``, followed by one
    column per key. Keys missing from a row's dictionary are set to False. The
    dataframe passed in is not modified.
    """
    # ast.literal_eval only accepts Python literals, so unlike eval() it cannot run
    # code that happens to be embedded in the CSV data.
    parsed = df[column_name].apply(lambda x: dict(ast.literal_eval(x)))
    expanded_df = parsed.apply(pd.Series)
    # A key absent from a row's dictionary shows up as NaN; treat it as False.
    expanded_df = expanded_df.fillna(False)
    df = pd.concat([df, expanded_df], axis=1)
    return df.drop(columns=[column_name])

def replace_column_nan(df, column_name, index_of_value_count):
    """Fill the NaN entries of ``df[column_name]`` in place with an existing value.

    ``index_of_value_count`` is a position in the index of
    ``df[column_name].value_counts()``; the value found at that position is used as
    the replacement. Nothing is returned: the column is overwritten on ``df``.
    """
    observed_values = df[column_name].value_counts().index
    fill_value = observed_values[index_of_value_count]
    df[column_name] = df[column_name].fillna(fill_value)

In [None]:
# Show how often each distinct value occurs in the raw business 'stars' column.
view_column_values(business_df, 'stars')

In [None]:
# Count the missing entries in the raw business 'stars' column.
business_df['stars'].isnull().sum()

#### Replacing all the NaN

In [None]:
# Ambience is a dictionary string: fill NaN with the all-False default, then expand it
# into one column per ambience flag.
business_df_replace['attributes_Ambience'] = business_df_replace['attributes_Ambience'].fillna(ambience_default)
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_Ambience')

# For each plain column: which position of value_counts() replaces NaN.
# The trailing notes are the original author's record of what that position held;
# check with view_column_values before relying on them.
nan_fill_positions = {
    'attributes_Alcohol': 0,                     # default full_bar, to change to none, use 1
    'attributes_BikeParking': 0,                 # default yes parking
    'attributes_BusinessAcceptsCreditCards': 1,  # default True
    'attributes_Caters': 0,                      # default True
    'attributes_GoodForKids': 0,                 # default True
    'attributes_HasTV': 0,                       # default True
    'attributes_NoiseLevel': 0,                  # default Average
    'attributes_OutdoorSeating': 0,              # default True
    'attributes_RestaurantsAttire': 0,           # default casual
    'attributes_RestaurantsDelivery': 0,         # default false
    'attributes_RestaurantsGoodForGroups': 0,    # default true
    'attributes_RestaurantsPriceRange2': 0,      # default 2$ signs
    'attributes_RestaurantsReservations': 0,     # default true
    'attributes_RestaurantsTableService': 0,     # default true (was filled four times; once is enough)
    'attributes_RestaurantsTakeOut': 0,          # default true
    'attributes_WheelchairAccessible': 0,        # default true
    'attributes_WiFi': 0,                        # default free
}
for column_name, position in nan_fill_positions.items():
    replace_column_nan(business_df_replace, column_name, position)

# The remaining dictionary-string columns are filled first and then expanded, in the
# same order as before so the resulting column order is unchanged.
replace_column_nan(business_df_replace, 'attributes_BusinessParking', 0)  # default just lot parking
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_BusinessParking')
replace_column_nan(business_df_replace, 'attributes_GoodForMeal', 0)  # default good for lunch and dinner
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_GoodForMeal')

#### Changing categorical input to numerical

In [None]:
# Columns with one of these dtypes are already numerical and are left untouched.
already_numeric = (np.dtype('int32'), np.dtype('int64'), np.dtype(float))
for col in business_df_replace.columns:
    # The id and the star rating are kept exactly as they are.
    if col in ('business_id', 'stars'):
        continue
    col_dtype = business_df_replace[col].dtypes
    if col_dtype == bool:
        # true becomes 1, false becomes 0
        business_df_replace[col] = business_df_replace[col] * 1
    elif col_dtype not in already_numeric:
        # every other column is treated as categorical and replaced by its category codes
        business_df_replace[col] = business_df_replace[col].astype('category').cat.codes


In [None]:
# Count the remaining NaN values per column.
# Note: the business_id column is not numerical; it was deliberately left unchanged.
business_df_replace.isnull().sum()

In [None]:

# USE THIS
# Business features that are removed from the final feature set.
business_columns_to_drop = [
    'attributes_BikeParking',
    'attributes_BusinessAcceptsCreditCards',
    'attributes_Caters',
    'attributes_GoodForKids',
    'attributes_HasTV',
    'attributes_NoiseLevel',
    'attributes_OutdoorSeating',
    'attributes_RestaurantsAttire',
    'attributes_RestaurantsDelivery',
    'attributes_RestaurantsGoodForGroups',
    'attributes_RestaurantsReservations',
    'attributes_RestaurantsTakeOut',
    'attributes_WiFi',
    'garage', 'lot', 'street', 'valet', 'validated',
    'state',
    'attributes_RestaurantsTableService',
    'attributes_WheelchairAccessible',
]
# Currently kept (add to the list above to drop them as well):
#        'casual', 'classy', 'hipster', 'intimate', 'romantic', 'touristy',
#        'trendy', 'upscale', 'divey', 'breakfast', 'brunch', 'dessert',
#        'dinner', 'latenight', 'lunch',
business_df_replace = business_df_replace.drop(columns=business_columns_to_drop)
business_df_replace.columns

## Preprocessing User data
Most of the user features are numerical. The features 'elite', 'friends', 'name', and 'yelping_since' are not, so I decided to drop them, as they do not intuitively seem very important.

In [None]:
users_df = pd.read_csv("all/users.csv")
# User columns that are left out of the feature set.
user_columns_to_drop = [
    'compliment_cool', 'compliment_cute', 'compliment_funny', 'compliment_hot',
    'compliment_list', 'compliment_more', 'compliment_note', 'compliment_photos',
    'compliment_plain', 'compliment_profile', 'compliment_writer',
    'name',
    'friends',
    'fans',
    'yelping_since',
    'elite',
    'cool',
    'funny',
]
# drop() returns a new frame, so the raw users_df stays untouched.
users_df_replace = users_df.drop(columns=user_columns_to_drop)
# users_df_replace['elite'] = (users_df_replace['elite'] == True).astype(int)
# users_df_replace['yelping_since'] = users_df_replace['yelping_since'].map(lambda x: pd.to_datetime(x).timestamp())
users_df_replace.columns


In [None]:
# Check that every user feature is numerical: prints the name of each column
# (other than user_id) whose dtype is not int32, int64 or float. No output means all good.
numeric_user_dtypes = (np.dtype('int32'), np.dtype('int64'), np.dtype(float))
for col in users_df_replace.columns:
    if col == 'user_id':
        continue
    if users_df_replace[col].dtypes not in numeric_user_dtypes:
        print(col)

In [None]:
# Check that there are no NaN values: shows the NaN count of every user column.
users_df_replace.isnull().sum()

In [None]:
# Use this: the preprocessed user frame that is joined onto the reviews further below.
users_df_replace

In [None]:
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# cols_to_norm = ['review_count', 'useful']
# for col in cols_to_norm:
#     users_df_replace[col] = sc.fit_transform(users_df_replace[col])
# users = sc.fit_transform(users_df_replace)
# cols = users_df_replace.columns
# ids = users_df_replace['user_id']
# avg_stars = users_df_replace['average_stars']
# users = users_df_replace.copy()
# users.drop(['average_stars', 'user_id'], axis=1, inplace=True)

# users_clean = pd.DataFrame(columns=cols)
# users_df_replace['review_count']
# users_df_replace[cols_to_norm] = sc.fit_transform(users_df_replace[cols_to_norm])
# users_df_replace
# sc.fit_transform([users_df_replace['review_count']])
# users_df_replace['review_count']
# y_train = sc.fit_transform(test_df_x)

## Preprocessing Review Data

For the review data we use the user id and business id as references and join with our business and user dataframes. This will be the basis of our training dataset.

In [None]:
sample_submission = pd.read_csv("all/sample_submission.csv")
# Only the two join keys and the star rating of each review are kept.
review_columns = ['user_id', 'business_id', 'stars']
train_reviews = pd.read_csv("all/train_reviews.csv").loc[:, review_columns]
train_reviews

In [None]:
train_reviews_replace = train_reviews.copy()
# Attach the user features (left join) and then the business features (inner join).
# Both the reviews and the businesses have a 'stars' column, so the merge suffixes
# them; they are renamed to say which is which.
reviews_denorm = (
    train_reviews_replace
    .merge(users_df_replace, how='left', on='user_id')
    .merge(business_df_replace, how='inner', on='business_id')
    .rename(columns={'stars_x': 'review_stars', 'stars_y': 'business_stars'})
)
# Target: the stars given in the review. Features: everything except the target and the ids.
train_df_y = reviews_denorm['review_stars']
train_df_x = reviews_denorm.drop(columns=['review_stars', 'business_id', 'user_id'])


In [None]:
# Number of training rows per review star value.
reviews_denorm.groupby('review_stars').size()

In [None]:
# validate_df is our validation set; it gets the same user and business features
# joined on as the training reviews (both as left joins here).
validate_df = pd.read_csv("all/validate_queries.csv")
validate_df_denorm = (
    validate_df
    .merge(users_df_replace, how='left', on='user_id')
    .merge(business_df_replace, how='left', on='business_id')
    .rename(columns={'stars_x': 'review_stars', 'stars_y': 'business_stars'})
)
test_df_y = validate_df_denorm['review_stars']
# Features: everything except the target, the ids and the 'Unnamed: 0' column from the csv.
test_df_x = validate_df_denorm.drop(columns=['Unnamed: 0', 'review_stars', 'business_id', 'user_id'])

# train_df_x = pd.concat([train_df_x, test_df_x])
# train_df_y = pd.concat([train_df_y, test_df_y])
# test_df_x.columns, train_df_x.columns

In [None]:
# 'useful' should be normalized by the number of reviews a user has given
# (review_count_x is the user's review count after the merges).
# Note: running this cell a second time divides again.
for feature_frame in (train_df_x, test_df_x):
    feature_frame['useful'] = feature_frame['useful'] / feature_frame['review_count_x']

In [None]:
from sklearn.preprocessing import StandardScaler

# Fill the NaN values left in 'useful' before scaling (once per frame is enough).
replace_column_nan(train_df_x, 'useful', 0)
replace_column_nan(test_df_x, 'useful', 0)

# z-score normalize the inputs to our model.
# The scaler is fitted on the training features only and then reused for the
# validation features. Re-fitting it on the validation data would scale the two sets
# with different means/variances, so the model would see validation inputs on a
# different scale than the one it was trained on.
scaler = StandardScaler()
train_x = scaler.fit_transform(train_df_x)
test_x = scaler.transform(test_df_x)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, make_scorer, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score


# for max_itr in [200, 500, 700]:
#     for alph in [1e-4, 1e-3, 1e-2]:
#         reviews_denorm.shape
#         clf = MLPClassifier(solver='sgd', alpha=alph, hidden_layer_sizes=(100, 100, 8), learning_rate='adaptive', learning_rate_init= 0.0001, max_iter=max_itr)
#         clf.fit(train_df_x, train_df_y.values)      
#         y_pred = clf.predict(test_df_x)

# clf = MLPRegressor(solver='sgd', alpha=1e-4, hidden_layer_sizes=(50, 50, 8), learning_rate='adaptive', learning_rate_init= 0.0001, max_iter=200)
# clf.fit(train_df_x, train_df_y.values)      
# y_pred = clf.predict(test_df_x)
# y_pred_train = clf.predict(train_df_x)

# Seed for the network's weight initialisation and shuffling, so that a re-run of the
# grid search gives the same numbers.
RANDOM_STATE = 42


def report_mse_accuracy(y_true, y_pred, label="Validation"):
    """Print the MSE and accuracy of ``y_pred`` against ``y_true``.

    Parameters:
        y_true: the true star values.
        y_pred: the predicted star values (already rounded by the caller).
        label:  prefix for the MSE line; defaults to "Validation".

    Returns the accuracy score.
    """
    #print(classification_report(y_true, y_pred)) # print classification report
    print("{} MSE {}".format(label, mean_squared_error(y_true, y_pred)))
    accuracy = accuracy_score(y_true, y_pred)
    # Print the score itself (previously the y_true array was printed here).
    print("Accuracy {}".format(accuracy))
    return accuracy


# Grid search over training length, L2 penalty and layer width.
# The helper above has to be defined before this loop runs, otherwise a fresh kernel
# fails with a NameError.
for max_itr in [200, 500, 700, 1000]:
    for alph in [1e-4, 1e-3, 5e-3, 1e-2]:
        for hidden_layer_size in [25, 50, 100]:
            print("Reporting Stats: max_iter: {}, alpha: {}, hidden_layer_size: {}".format(max_itr, alph, hidden_layer_size))
            # two equally wide hidden layers followed by a narrow one (a fifth of the width)
            network_topography = (hidden_layer_size, hidden_layer_size, hidden_layer_size // 5)
            clf = MLPRegressor(solver='sgd', alpha=alph, hidden_layer_sizes=network_topography,
                               learning_rate='adaptive', learning_rate_init=0.0001,
                               max_iter=max_itr, random_state=RANDOM_STATE)
            clf.fit(train_x, train_df_y.values)
            # Predict on the scaled features the model was fitted on (train_x), not on
            # the unscaled train_df_x frame.
            y_pred_train = clf.predict(train_x).round(decimals=0)
            y_pred = clf.predict(test_x).round(decimals=0)
            print("training_report")
            report_mse_accuracy(train_df_y.values, y_pred_train, label="Training")
            print("validation_report")
            report_mse_accuracy(test_df_y.values, y_pred)
            print("\n\n")


# for k in [15, 25, 40, 55, 80, 125, 250, 350, 500, 1000]:
#     classifier = KNeighborsClassifier(n_neighbors=k, algorithm='auto', weights='distance') 
#     classifier.fit(train_df_x.values, train_df_y.values)
#     scores = cross_val_score(classifier, X=train_df_x, y=train_df_y, cv=5, \
#                scoring=make_scorer(classification_report_with_accuracy_score))
#     print(k, scores)
#     y_pred = classifier.predict(test_df_x)
#     print(k, classification_report(test_df_y.values, y_pred))

# train_df_x.drop(['yelping_since'], axis=1, inplace=True)
# train_df_x

# for depth in (7, 10, 15):
# print(7)
# regressor = RandomForestClassifier(n_estimators=150, max_depth=7, min_samples_split=5)
# regressor.fit(train_df_x, train_df_y)
# scores = cross_val_score(regressor, X=train_df_x, y=train_df_y, cv=5, \
#            scoring=make_scorer(classification_report_with_accuracy_score))
# print(10)
# regressor = RandomForestClassifier(n_estimators=150, max_depth=10, min_samples_split=5)
# regressor.fit(train_df_x, train_df_y)
# scores = cross_val_score(regressor, X=train_df_x, y=train_df_y, cv=5, \
#            scoring=make_scorer(classification_report_with_accuracy_score))
# print(15)
# regressor = RandomForestClassifier(n_estimators=150, max_depth=15, min_samples_split=5)
# regressor.fit(train_df_x, train_df_y)
# scores = cross_val_score(regressor, X=train_df_x, y=train_df_y, cv=5, \
#            scoring=make_scorer(classification_report_with_accuracy_score))
#     print(depth, scores)

#y_pred = regressor.predict(test_df_x.values)

# print(classification_report(test_df_y.values, y_pred))



In [None]:
# Display the (unscaled) training feature frame for inspection.
train_df_x

In [None]:
#mean_squared_error(test_df_y.values, y_pred.round(decimals=0))
# Training MSE of the last fitted model. The model was fitted on the scaled features
# (train_x), so it has to predict on those as well rather than on the unscaled frame.
y_pred_train = clf.predict(train_x)
mean_squared_error(train_df_y.values, y_pred_train.round(decimals=0))

In [None]:
# Show the validation predictions held in y_pred, rounded to whole numbers.
y_pred.round(decimals=0)

In [None]:
# Display the joined validation frame (queries + user features + business features).
validate_df_denorm

In [None]:
# Display the scaled training features produced by the StandardScaler.
train_x

In [None]:
# How often each predicted value occurs in y_pred.
# Note: despite its name, `df` is a Series here, not a DataFrame.
df = pd.Series(y_pred)
df.value_counts()

In [None]:
test = pd.read_csv("all/test_queries.csv")
test = pd.merge(test, users_df_replace, how='left', on='user_id')
test = pd.merge(test, business_df_replace, how='left', on='business_id')

# Here only the business frame contributes a 'stars' column; rename it to match the
# column name used in the training features.
test = test.rename(columns={'stars': 'business_stars'})
# test.columns

# Keep exactly the columns the model was trained on, in the same order. This also
# leaves out the id columns and anything else that is not a training feature.
submit_df_x = test[train_df_x.columns].copy()
# Apply the same preparation as for the training/validation features:
# normalise 'useful' by the user's review count, fill its NaN values, then scale.
submit_df_x['useful'] = submit_df_x['useful'] / submit_df_x['review_count_x']
replace_column_nan(submit_df_x, 'useful', 0)
submit_x = scaler.transform(submit_df_x)

# Use the fitted network `clf`; `regressor` is only defined in commented-out code, so
# calling it fails on a fresh kernel. Predictions are rounded to whole stars, the same
# way they are rounded when the model is evaluated.
submit_y = clf.predict(submit_x).round(decimals=0)
# submit_y = pd.DataFrame(submit_y)


In [None]:
# Run to put the predictions in submit format:
# a single 'stars' column, with the row index written out under the header 'index'.
submit = pd.DataFrame({'stars': submit_y})
submit = submit.rename_axis('index')
submit.to_csv('submit.csv')

In [None]:
# Rebuild the joined test frame (queries + user features + business features),
# this time keeping the id columns, so it can be inspected below.
test = pd.read_csv("all/test_queries.csv")
test = pd.merge(test, users_df_replace, how='left', on='user_id')
test = pd.merge(test, business_df_replace, how='left', on='business_id')

In [None]:
# Display the joined test frame for inspection.
test