In [6]:
import numpy as np
import pandas as pd
import json
import time
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [7]:
pd.set_option('display.max_columns', None)

### How do we predict what rating a user is going to give?
1) Business side:
    - location
    - type
    - hours
    - parking availability
    - ambience
    - takeout?
2) User side:
    - how long on yelp
    - elite status
    - review count
    - useful/funny/cool
    - home

# TODO
    - Matthew
        - isolate restaurants in business_df
        - preprocessing: marking reviews as good or bad
    - feature extraction
        - hometown
        - number of tips by user

    - other preprocessing
        - yelpin since: timestamp today - yelping since --- COMPLETE
        - elite status: how many years have they been elite
        - only consider business with certain number of reviews
        - only consider "active users"

# Next
    - what happens if we remove "inactive" users

        

In [8]:
business_df = pd.read_csv("sample_restaurant.csv")
checkin_df = pd.read_csv("sample_checkin.csv")
review_df = pd.read_csv("sample_review.csv")
tip_df = pd.read_csv("sample_tip.csv")
user_df = pd.read_csv("sample_user.csv")

In [9]:
business_df.drop(columns = 'Unnamed: 0', inplace = True)
business_loc = business_df.iloc[:, [0,3, 4, 5, 6, 7]]
user_useful = user_df.iloc[:, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]]
user_useful.set_index('user_id', inplace = True)
review_df.drop(columns = 'Unnamed: 0', inplace=True)
rJoinU = review_df.join(user_useful, on = 'user_id', lsuffix='_Rev', rsuffix = '_User')
rJoinU.dropna(inplace = True)
business_loc.set_index('business_id', inplace= True)
rJUJBloc = rJoinU.join(business_loc, on='business_id', lsuffix='_rJU', rsuffix='_bloc')
rJUJBloc.dropna(inplace= True)
hometown = rJUJBloc.groupby(['user_id'])['city'].agg(lambda x: pd.Series.mode(x)[0]).to_frame()

In [10]:
hometown

Unnamed: 0_level_0,city
user_id,Unnamed: 1_level_1
--_r6E98SNIrGU7weyNxbw,West Chester
--rpFxc_x14BWF708pfR6Q,Philadelphia
-0U1fhFq9zl4AyKafrKHFw,Nashville
-0aInSHjCWLfiNqfgmWnow,Philadelphia
-0aZWYi2YicFaLxTru96nA,Philadelphia
...,...
zwXmvn1op5LuFF2Kveqaug,Indianapolis
zxNtaKTMzZzum8ek2v4UXg,Reno
zxuxd6Hz2tKcpgZ71dYEcw,Edmonton
zyNrXvJyYdC34tS6BcCykA,Philadelphia


In [11]:
data = rJUJBloc.merge(hometown, how = 'left', on="user_id", suffixes=('_x', '_y'))
data['is_home'] = (data['city_x'] == data['city_y']).astype(int)

## Yelping Since

In [12]:
data['time_since'] = (pd.Timestamp(2022, 11, 29) - pd.to_datetime(data.date)).dt.days
merged = business_df.merge(review_df, on='business_id')
merged.dropna(inplace=True)
merged['goodReview'] = (merged['stars_x'] < merged['stars_y']).astype(int)
goodRev = merged.loc[:, ['review_id','business_id', 'stars_x', 'goodReview']]
goodRev.rename(columns={'stars_x': 'stars_bus'}, inplace=True)
dataFull = data.merge(goodRev, on='review_id')

#### Using features: 
    - star rating of review
    - review count of user
    - useful of user
    - funny of user
    - cool of user
    - # fans of user
    - average star rating OF USER
    - is hometown
    - time since joining yelp

## TODO:
    - try a 70-30 train test split and evaluate performance
    - extract elite
    - extract # of friends

# Elite

In [13]:
elite = data.loc[:, ['user_id', 'elite']]
elite['glitch'] = elite['elite'].str.contains('20,20').astype(int)
elite['elite'] = elite['elite'].str.split(',')
elite['count'] = elite['elite'].apply(lambda x: len(x))
elite['trueCount'] = elite['count'] - elite['glitch']
elite.drop(columns=['elite', 'glitch', 'count'], inplace = True)
elite = elite.rename(columns={'trueCount': 'eliteCount'})
elite = elite.drop_duplicates(subset = 'user_id')
dataFull = dataFull.merge(elite, how = 'left', on = 'user_id')
friends = data.loc[:, ['user_id', 'friends']]
friends['numFriends'] = friends.friends.str.split(',').apply(lambda x: len(x))
friends.drop(columns = 'friends', inplace = True)
friends = friends.drop_duplicates(subset='user_id')
dataFull = dataFull.merge(friends, how = 'left', on = 'user_id')
dataFull = dataFull.drop_duplicates(subset='review_id')
business_df.loc[1, 'attributes']

'{\'RestaurantsReservations\': \'True\', \'WiFi\': "u\'free\'", \'RestaurantsTakeOut\': \'True\', \'GoodForMeal\': "{\'dessert\': False, \'latenight\': False, \'lunch\': False, \'dinner\': False, \'brunch\': False, \'breakfast\': False}", \'Alcohol\': "u\'full_bar\'", \'BusinessAcceptsCreditCards\': \'True\', \'HappyHour\': \'True\', \'RestaurantsTableService\': \'True\', \'Ambience\': "{\'touristy\': False, \'hipster\': False, \'romantic\': False, \'divey\': False, \'intimate\': False, \'trendy\': False, \'upscale\': False, \'classy\': False, \'casual\': False}", \'BusinessParking\': "{\'garage\': False, \'street\': False, \'validated\': False, \'lot\': True, \'valet\': False}", \'HasTV\': \'True\', \'OutdoorSeating\': \'False\'}'

In [14]:
# Restauran attributes from Jesse's data
restaurant_df = pd.read_csv('final_restaurant.csv')
onehot = restaurant_df.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count' ])
dataFull.rename(columns={'business_id_x':'business_id'}, inplace = True)
dataFullFull = dataFull.merge(onehot, how = 'left', on = 'business_id')

In [15]:
dataFullFull.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful_Rev,funny_Rev,cool_Rev,text,date,name,review_count,yelping_since,useful_User,funny_User,cool_User,elite,friends,fans,average_stars,city_x,state,postal_code,latitude,longitude,city_y,is_home,time_since,business_id_y,stars_bus,goodReview,eliteCount,numFriends,is_open,RestaurantsDelivery_False,RestaurantsDelivery_True,OutdoorSeating_False,OutdoorSeating_True,BusinessAcceptsCreditCards_True,BikeParking_True,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsTakeOut_True,WiFi_u'free',WiFi_u'no',Alcohol_u'full_bar',Alcohol_u'none',Caters_False,Caters_True,RestaurantsAttire_'casual',RestaurantsAttire_u'casual',RestaurantsReservations_False,RestaurantsReservations_True,GoodForKids_True,RestaurantsTableService_True,RestaurantsGoodForGroups_True,WheelchairAccessible_True,HasTV_False,HasTV_True,NoiseLevel_u'average',dessert_False,latenight_False,lunch_False,lunch_True,dinner_False,dinner_True,brunch_False,breakfast_False,romantic_False,intimate_False,touristy_False,hipster_False,divey_False,classy_False,trendy_False,upscale_False,casual_False,casual_True,garage_False,street_False,street_True,validated_False,lot_False,lot_True,valet_False,Food Trucks,Juice Bars & Smoothies,Tacos,Soup,Arts & Entertainment,Ice Cream & Frozen Yogurt,Beer,Wine & Spirits,Vegetarian,Mediterranean,Pubs,Cocktail Bars,Diners,Steakhouses,Asian Fusion,Barbeque,Sushi Bars,Sports Bars,Japanese,Desserts,Bakeries,Specialty Food,Caterers,Delis,Cafes,Chicken Wings,Salad,Event Planning & Services,Chinese,Seafood,Coffee & Tea,Italian,Mexican,Burgers,American (New),Breakfast & Brunch,Fast Food,Pizza,American (Traditional),Bars,Sandwiches,Nightlife,Food
0,NILjwITiFhpdhiE3SiHR8Q,WUgxsRUFjATha8L4qWYDww,fbQkVymvZ0dig8umltKhZQ,3,2,1,1,They give you LOTS of food and you will not go...,2020-09-23 21:49:15,Glenn,696,2011-07-28 01:14:09,975,329,552,"2012,2013,2014,2015,2016,2017,2018,2019,20,20,...","dXD3TU8-FR4NnIzlkRk_3g, nSRPpreSnTaAMrEx0YvdnA...",31,3.57,Tampa,FL,33609,27.948237,-82.527587,Tampa,1,796,fbQkVymvZ0dig8umltKhZQ,4.0,0,10,168,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,hOwUw1fz3aB4ZhgqCetGTw,fr1Hz2acAb3OaL3l6DyKNg,RewAum_fQ0lXdFoUX18J0g,5,17,4,13,It is 4.5 stars. Not yet 5 stars. Mel has a go...,2020-08-18 19:09:46,Boon,3109,2014-05-10 14:13:19,31200,9602,27824,20142015201620172018201920202021,"tuoDDqTDCB-lDgelvMrp5Q, 0yyXKmdI9lHxFKzomy6yfw...",387,4.04,Tampa,FL,33610,28.031316,-82.451534,Tampa,1,832,RewAum_fQ0lXdFoUX18J0g,5.0,0,8,825,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,bbVN5xloXk4Bzf6AF0wwMw,8fkeTZoaojIL3GIHvvTCyA,LdECsE8lJS7v5GTFTcjPSg,4,0,0,0,"Before reading on, make sure id you eat here y...",2018-12-16 00:07:13,Amy,198,2017-04-30 20:46:12,133,24,174,201920202021,"vv1MjgCvZUbThE7oFJRCcQ, jt49xjEjQisu6wTTGn6B3A...",9,4.15,St. Pete Beach,FL,33706,27.725209,-82.742302,St. Pete Beach,1,1443,LdECsE8lJS7v5GTFTcjPSg,4.0,0,3,34,1,0,1,0,1,1,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,OQL_x9smctWsVq_qQTOlcg,_zC91aGLwBjnMJji-rpG9A,cGX-1IUwXOjkUqZbkKYcjw,5,8,0,3,"Fogo De Chao\r\n1337 Chestnut Street, Philadel...",2015-11-03 03:01:00,John,95,2014-09-13 23:39:11,202,34,43,20152016,"pv-q73AyQSG_nLXSziZxwA, 17vVd5zdMMSD8ESJ51InjA...",16,4.02,Philadelphia,PA,19107,39.950917,-75.162971,Philadelphia,1,2582,cGX-1IUwXOjkUqZbkKYcjw,4.0,1,2,68,1,0,1,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,P-tNl7Z4lBq-nPvzU0S2Iw,pUNvLJwdJIOemOgU98mp1w,2HxkdqHmbYGj_BH1bLaiSw,4,5,2,3,Nektar is a beautiful wine bar right in the ce...,2020-04-10 12:53:26,Amy,488,2011-07-24 03:16:25,1191,235,614,"2012,2013,2014,2015,2016,2017,2018,2019,20,20,...","1Px8_etTU-O0NTBLemwDJA, 404akRRXmNzHixRDcFT7rA...",45,4.14,New Hope,PA,18938,40.362115,-74.950957,Lansdale,0,962,2HxkdqHmbYGj_BH1bLaiSw,4.0,0,10,335,1,0,1,0,1,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


# Model

In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [17]:
# dataUseful = dataFull.loc[:, ['goodReview', 'review_count', 'useful_User', 'funny_User', 'cool_User', 'fans', 'average_stars', 'is_home', 'time_since', 'eliteCount', 'numFriends']]
dataUseful = dataFullFull.drop(columns = ['review_id', 'user_id', 'business_id', 'stars', 'text', 'date', 'name', 'yelping_since', 'elite', 'friends', 'city_x', 'state', 'postal_code', 'latitude', 'longitude', 'city_y', 'business_id_y', 'stars_bus' ])

In [19]:
simpleData = dataFull.loc[:, ['goodReview', 'review_count', 'useful_User', 'funny_User', 'cool_User', 'fans', 'average_stars', 'is_home', 'time_since', 'eliteCount', 'numFriends']]

In [None]:
corrmat = dataUseful.corr()
f, ax = plt.subplots(figsize = (12, 10))
mask = np.triu(np.ones_like(corrmat, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)
sns.heatmap(corrmat, annot=True, mask = mask, cmap = cmap)
plt.show()
#f

In [102]:
firstcol = dataUseful.pop('goodReview')
dataUseful.insert(0, 'goodReview', firstcol)

Unnamed: 0,goodReview,useful_Rev,funny_Rev,cool_Rev,review_count,useful_User,funny_User,cool_User,fans,average_stars,is_home,time_since,eliteCount,numFriends,is_open,RestaurantsDelivery_False,RestaurantsDelivery_True,OutdoorSeating_False,OutdoorSeating_True,BusinessAcceptsCreditCards_True,BikeParking_True,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsTakeOut_True,WiFi_u'free',WiFi_u'no',Alcohol_u'full_bar',Alcohol_u'none',Caters_False,Caters_True,RestaurantsAttire_'casual',RestaurantsAttire_u'casual',RestaurantsReservations_False,RestaurantsReservations_True,GoodForKids_True,RestaurantsTableService_True,RestaurantsGoodForGroups_True,WheelchairAccessible_True,HasTV_False,HasTV_True,NoiseLevel_u'average',dessert_False,latenight_False,lunch_False,lunch_True,dinner_False,dinner_True,brunch_False,breakfast_False,romantic_False,intimate_False,touristy_False,hipster_False,divey_False,classy_False,trendy_False,upscale_False,casual_False,casual_True,garage_False,street_False,street_True,validated_False,lot_False,lot_True,valet_False,Food Trucks,Juice Bars & Smoothies,Tacos,Soup,Arts & Entertainment,Ice Cream & Frozen Yogurt,Beer,Wine & Spirits,Vegetarian,Mediterranean,Pubs,Cocktail Bars,Diners,Steakhouses,Asian Fusion,Barbeque,Sushi Bars,Sports Bars,Japanese,Desserts,Bakeries,Specialty Food,Caterers,Delis,Cafes,Chicken Wings,Salad,Event Planning & Services,Chinese,Seafood,Coffee & Tea,Italian,Mexican,Burgers,American (New),Breakfast & Brunch,Fast Food,Pizza,American (Traditional),Bars,Sandwiches,Nightlife,Food
0,0,2,1,1,696,975,329,552,31,3.57,1,796,10,168,1,1,0,0,0,1,1,1,0,1,0,0,0,1,0,0,1,0,1,0,1,0,1,0,0,1,0,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,17,4,13,3109,31200,9602,27824,387,4.04,1,832,8,825,1,0,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,198,133,24,174,9,4.15,1,1443,3,34,1,0,1,0,1,1,1,0,1,1,0,0,1,0,0,1,1,0,0,1,1,1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,8,0,3,95,202,34,43,16,4.02,1,2582,2,68,1,0,1,0,1,1,0,0,0,1,0,1,0,0,0,1,1,0,0,1,0,0,1,0,1,0,1,1,1,0,0,0,1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,5,2,3,488,1191,235,614,45,4.14,0,962,10,335,1,0,1,0,1,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,0,0,1,1,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9898,1,2,0,0,264,502,37,164,10,4.23,0,900,4,38,1,0,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,1,1,1,1,1,0,1,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
9899,1,16,2,13,231,1876,630,1389,167,4.33,1,322,2,2048,1,0,0,1,0,1,1,0,1,1,1,0,1,0,0,1,0,0,1,0,0,0,1,1,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9900,1,0,0,0,104,108,54,45,4,4.53,1,1109,3,246,1,0,1,0,1,1,1,1,0,1,1,0,1,0,1,0,0,1,0,1,0,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,0,0,1,0,1,0,0,1,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0
9901,0,0,0,0,146,160,10,106,12,4.26,1,2301,5,109,1,0,1,0,1,1,1,0,1,1,0,1,0,0,0,1,1,0,1,0,1,1,1,1,0,1,1,0,1,0,1,0,1,0,1,1,1,1,0,1,1,0,1,0,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [21]:
y = simpleData['goodReview']
X = simpleData.loc[:, 'review_count':]
scaler = MinMaxScaler()
X_ss = scaler.fit_transform(X)

In [22]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_ss, y, test_size=0.3)

In [31]:
y.value_counts()

0    5050
1    4853
Name: goodReview, dtype: int64

### KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
searchSpace = {
    'n_neighbors': range(1,25)
}
gs = GridSearchCV(KNeighborsClassifier(), searchSpace, cv=5, scoring = 'f1_macro')
gs.fit(xtrain, ytrain)
print(gs.best_estimator_)

KNeighborsClassifier(n_neighbors=21)


In [27]:
neigh = KNeighborsClassifier(n_neighbors=21)
neigh.fit(xtrain, ytrain)
yhat = neigh.predict(xtest)

In [28]:
knn_acc = accuracy_score(ytest, yhat)
knn_f1 = f1_score(ytest, yhat)
knn_recall = recall_score(ytest, yhat)
knn_precision = precision_score(ytest, yhat)

In [29]:
print(knn_acc)
print(knn_f1)
print(knn_recall)
print(knn_precision)

0.5647929989902389
0.5467928496319664
0.5401662049861495
0.553584102200142


### Logistic

In [32]:
from sklearn.linear_model import LogisticRegression

In [34]:
clfl2 = LogisticRegression(penalty='l2').fit(xtrain, ytrain)
clfl1 = LogisticRegression(penalty='l1', solver = 'saga').fit(xtrain, ytrain)
clfelastic = LogisticRegression(penalty='elasticnet', solver = 'saga', l1_ratio = 0.5).fit(xtrain, ytrain)
clf = LogisticRegression(penalty='none').fit(xtrain, ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
yhatl2 = clfl2.predict(xtest)
yhatl1 = clfl1.predict(xtest)
yhatelastic = clfelastic.predict(xtest)
yhatlogit = clf.predict(xtest)

In [36]:
models = [yhatl2, yhatl1, yhatelastic, yhatlogit]
for yhat in models:
    logit_acc = accuracy_score(ytest, yhat)
    logit_f1 = f1_score(ytest, yhat)
    logit_recall = recall_score(ytest, yhat)
    logit_precision = precision_score(ytest, yhat)
    print(logit_acc)
    print(logit_f1)
    print(logit_recall)
    print(logit_precision)
    print("-----------")

0.6035005048805117
0.5878236529041289
0.5817174515235457
0.594059405940594
-----------
0.6045102659037361
0.5893044390073401
0.5837950138504155
0.5949188426252646
-----------
0.6038370918882531
0.5880294014700734
0.5817174515235457
0.5944798301486199
-----------
0.6028273308650286
0.5879888268156424
0.5831024930747922
0.5929577464788732
-----------


### Decision Tree

In [37]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
searchSpace = {
    'max_depth': range(1,20),
    'min_samples_leaf': range(1,20)
}
gs = GridSearchCV(DecisionTreeClassifier(), searchSpace, cv=5, scoring = 'f1_macro')
gs.fit(xtrain, ytrain)
print(gs.best_estimator_)

In [145]:
dt = DecisionTreeClassifier(max_depth=4, min_samples_leaf=12)
dt.fit(xtrain, ytrain)
yhat = dt.predict(xtest)

In [146]:
dt_acc = accuracy_score(ytest, yhat)
dt_f1 = f1_score(ytest, yhat)
dt_recall = recall_score(ytest, yhat)
dt_precision = precision_score(ytest, yhat)

In [147]:
print(dt_acc)
print(dt_f1)
print(dt_recall)
print(dt_precision)

0.586334567485695
0.5596560372626299
0.5371389270976616
0.5841436050860135


### SVM

In [148]:
from sklearn.svm import SVC

In [149]:
searchSpace = {'C': [0.1,1, 10, 100], 
                'gamma': [1,0.1,0.01,0.001],
                'kernel': ['rbf', 'poly', 'sigmoid']
}

In [150]:
gs = GridSearchCV(SVC(), searchSpace, cv = 5, scoring = 'f1_macro')
gs.fit(xtrain, ytrain)

In [None]:
print(gs.best_estimator_)

SVC(C=1, gamma=0.1, kernel='sigmoid')


In [246]:
svm = SVC(C = 1, gamma = 0.1, kernel = 'sigmoid')
svm.fit(xtrain, ytrain)
yhat = svm.predict(xtest)

In [247]:
svm_acc = accuracy_score(ytest, yhat)
svm_f1 = f1_score(ytest, yhat)
svm_recall = recall_score(ytest, yhat)
svm_precision = precision_score(ytest, yhat)

In [248]:
print(svm_acc)
print(svm_f1)
print(svm_recall)
print(svm_precision)

0.580949175361831
0.5471080392870135
0.50708024275118
0.593996840442338


### Random Forest

In [139]:
from sklearn.ensemble import RandomForestClassifier

In [141]:
searchSpace = {
    'n_estimators': range(10,15),
    'max_depth': range(1,10),
    'min_samples_leaf': range(1,20)
}

In [142]:
gs = GridSearchCV(RandomForestClassifier(), searchSpace, cv=5, scoring='f1_macro')
gs.fit(xtrain, ytrain)

KeyboardInterrupt: 

In [None]:
print(gs.best_estimator_)

In [143]:
rf = RandomForestClassifier(n_estimators = 15, max_depth = 5, min_samples_leaf = 15)
rf.fit(xtrain, ytrain)

RandomForestClassifier(max_depth=5, min_samples_leaf=15, n_estimators=15)

In [144]:
yhat = rf.predict(xtest)

In [145]:
rf_acc = accuracy_score(ytest, yhat)
rf_f1 = f1_score(ytest, yhat)
rf_recall = recall_score(ytest, yhat)
rf_precision = precision_score(ytest, yhat)

In [146]:
print(rf_acc)
print(rf_f1)
print(rf_recall)
print(rf_precision)

0.5954224166947156
0.5487987987987988
0.5044858523119393
0.6016460905349794


### Naive Bayes

In [172]:
from sklearn.linear_model import Perceptron

In [173]:
perceptron = Perceptron()
perceptron.fit(xtrain, ytrain)

Perceptron()

In [174]:
yhat = perceptron.predict(xtest)

In [179]:
perceptron.score(xtrain, ytrain)

0.5049047893825735

In [176]:
accuracy_score(yhat, ytest)

0.49612924941097275

In [178]:
f1_score(yhat, ytest)

0.6508047585724283