## Course Project Jupyter Notebook

#### Data Files 
- business.csv 
- sample_submission.csv
- test_queries.csv
- train_reviews.csv
- user.csv
- validate_queries.csv

In [3]:
import pandas as pd
import numpy as np

## Preprocessing Business Data

Expects the csv file to be in an "all" folder in the working directory of this notebook

In [4]:
business_df = pd.read_csv("all/business.csv", engine="python")
business_df_replace = business_df.copy()
# default value to replace for ambience when it is Nan
ambience_default = str({'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': False})
business_df

Unnamed: 0,address,attributes,attributes_AcceptsInsurance,attributes_AgesAllowed,attributes_Alcohol,attributes_Ambience,attributes_BYOB,attributes_BYOBCorkage,attributes_BestNights,attributes_BikeParking,...,hours_Wednesday,is_open,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state
0,595 Markham Street,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,17:0-1:0,0,43.664125,-79.411886,Southern Accent Restaurant,Palmerston,M6G 2L7,146,4.0,ON
1,2801 N 15th Ave,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",False,yes_free,"{'monday': False, 'tuesday': True, 'friday': T...",True,...,11:0-22:0,1,33.479807,-112.091188,Original Hamburger Works,,85007,277,4.0,AZ
2,"5508 County Rd N, Ste 3",,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,False,...,11:0-0:0,1,43.149488,-89.206641,Chicken Lips,,53590,102,4.5,WI
3,2227 N Rampart Blvd,,,,beer_and_wine,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,7:0-19:0,1,36.201990,-115.283122,Omelet House Summerlin,Summerlin,89128,242,4.0,NV
4,1111 W Bell Rd,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,11:0-22:0,1,33.639774,-112.087738,Manuel's Mexican Restaurant & Cantina - Bell Rd,,85023,230,3.5,AZ
5,1001 New Beginnings Dr,,,,,,,,,,...,9:0-17:0,1,36.080453,-115.038166,Central Church - Henderson,,89011,113,4.0,NV
6,5440 Walnut St,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,11:0-22:0,1,40.450866,-79.933919,China Palace,Shadyside,15232,110,3.0,PA
7,81 Underhill Drive,,,,none,"{'romantic': False, 'intimate': False, 'classy...",,,,True,...,11:30-22:0,1,43.745928,-79.324623,Allwyn's Bakery,,M3A 1K8,105,4.0,ON
8,2523 South Blvd,,,,,,,,,,...,20:0-6:0,1,35.202363,-80.864662,Long Animal Hospital,South End,28203,103,3.5,NC
9,6316 N Scottsdale Rd,,,,full_bar,"{'romantic': False, 'intimate': False, 'classy...",,,"{'monday': False, 'tuesday': False, 'friday': ...",False,...,16:30-21:30,1,33.530358,-111.925905,Fat Ox,,85253,234,4.0,AZ


### Feature Selection

I decided that we are only attempting to use features when more than half of its values are not Nan as otherwise, there are too less datapoints with a value. This percent non-Nan requirement can be changed nevertheless. 

I also decided to not look at the hours as it seems to complex to make into numerical value and would not help much in determining a user's review from intuition. The same is the case with the latitude, longitude, name, and address features. 

In [5]:
business_df_replace.drop([col for col in business_df.columns if business_df[col].isnull().sum() > 0.5 * 12058 ], axis=1, inplace=True)
business_df_replace.drop(['hours_Friday', 'hours_Monday', 'hours_Saturday',
       'hours_Sunday', 'hours_Thursday', 'hours_Tuesday', 'hours_Wednesday',
       'is_open', 'latitude', 'longitude', 'postal_code', 'name', 'address'], axis=1, inplace=True)
business_df_replace

Unnamed: 0,attributes_Alcohol,attributes_Ambience,attributes_BikeParking,attributes_BusinessAcceptsCreditCards,attributes_BusinessParking,attributes_Caters,attributes_GoodForKids,attributes_GoodForMeal,attributes_HasTV,attributes_NoiseLevel,...,attributes_RestaurantsTableService,attributes_RestaurantsTakeOut,attributes_WheelchairAccessible,attributes_WiFi,business_id,categories,city,review_count,stars,state
0,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': True, 'validated':...",True,False,"{'dessert': False, 'latenight': False, 'lunch'...",False,average,...,True,True,False,no,KuxDPl6UYNLxFChPm0_MNw,"Cajun/Creole, Southern, Restaurants",Toronto,146,4.0,ON
1,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': False, 'validated'...",False,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,False,True,True,no,6SAfQKe2oM5g_EtcYXyAMg,"Bars, Sports Bars, Dive Bars, Burgers, Nightli...",Phoenix,277,4.0,AZ
2,full_bar,"{'romantic': False, 'intimate': False, 'classy...",False,False,"{'garage': False, 'street': False, 'validated'...",False,False,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,False,True,True,no,upB0RQl-l529IVwgOpwOQQ,"Nightlife, Restaurants, Bars, Chicken Wings, A...",Sun Prairie,102,4.5,WI
3,beer_and_wine,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': False, 'validated'...",False,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,True,True,True,no,TulmRC5V0--dnXYd_GOSvA,"Beer, Wine & Spirits, Italian, Food, American ...",Las Vegas,242,4.0,NV
4,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': False, 'validated'...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,True,True,True,free,yqYtY3-Po4OVPafA9Z-Xyw,"Event Planning & Services, Soup, Salad, Mexica...",Phoenix,230,3.5,AZ
5,,,,,,,,,,,...,,,,,i90S4tfxFm0W2FZnhpJV3A,"Churches, Religious Organizations",Henderson,113,4.0,NV
6,full_bar,"{'romantic': False, 'intimate': False, 'classy...",True,True,"{'garage': False, 'street': True, 'validated':...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",True,average,...,True,True,True,no,xqbvqZHNyj2qExHdizzd0w,"Asian Fusion, Caterers, Fast Food, Chinese, Re...",Pittsburgh,110,3.0,PA
7,none,"{'romantic': False, 'intimate': False, 'classy...",True,False,"{'garage': False, 'street': False, 'validated'...",True,True,"{'dessert': False, 'latenight': False, 'lunch'...",False,average,...,False,True,,no,aXWN4oH8W-MVDchWia084g,"Caribbean, Food, Bakeries, Restaurants",Toronto,105,4.0,ON
8,,,,,,,,,,,...,,,,,a8ACgZ_bPPT6iRQ6R7Ridg,"Pet Groomers, Pet Boarding, Pets, Veterinarian...",Charlotte,103,3.5,NC
9,full_bar,"{'romantic': False, 'intimate': False, 'classy...",False,True,"{'garage': False, 'street': False, 'validated'...",True,False,"{'dessert': False, 'latenight': False, 'lunch'...",True,very_loud,...,True,True,True,free,V6rzs-QgnuW1CdOn-23nNw,"Food, Restaurants, Italian, Bars, Wine Bars, D...",Scottsdale,234,4.0,AZ


###  Helper Functions

##### view_column_values
Helps to view what values occur inside the column of a dataframe

##### expand_dict_to_columns
Sometimes there are columns in the dataframe in which the data is a dictionary string(such as attributes_Ambience). This function helps expand that dictionary string into extra columns with the column being the key and the row content being the value. It returns the modified dataframe.

##### replace_column_nan
There are many Nan in the data. This function replaces the Nan of a specifc column of a dataframe with one of the values that already occur. The third parameter index_of_value_count is used to specify what value to replace, the values which can be viewed using view_column_values


In [6]:
def view_column_values(df, column_name):
    return df[column_name].value_counts()

def expand_dict_to_columns(df, column_name):
    expanded_df = df[column_name].apply(lambda x : dict(eval(x))).apply(pd.Series)
    expanded_df.fillna(False, inplace=True)
    df = pd.concat([df, expanded_df], axis = 1)
    df.drop([column_name], axis=1, inplace=True)
    return df

def replace_column_nan(df, column_name, index_of_value_count):
    df[column_name] = df[column_name].fillna(df[column_name].value_counts().index[index_of_value_count])

In [7]:
view_column_values(business_df, 'stars')

4.0    4258
3.5    3052
4.5    2228
3.0    1389
2.5     487
5.0     416
2.0     151
1.5      64
1.0      13
Name: stars, dtype: int64

In [8]:
business_df['stars'].isnull().sum()

0

#### Replacing all the NaN

In [9]:
business_df_replace['attributes_Ambience'] = business_df_replace['attributes_Ambience'].fillna(ambience_default)
business_df_replace =expand_dict_to_columns(business_df_replace, 'attributes_Ambience')
replace_column_nan(business_df_replace, 'attributes_Alcohol', 0)      # default full_bar, to change to none, change last parameter to 1 
replace_column_nan(business_df_replace, 'attributes_BikeParking', 0)  # default yes parking
replace_column_nan(business_df_replace, 'attributes_BusinessAcceptsCreditCards', 1)  # default True
replace_column_nan(business_df_replace, 'attributes_BusinessParking', 0)  # default just lot parking
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_BusinessParking')
replace_column_nan(business_df_replace, 'attributes_Caters', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_GoodForKids', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_HasTV', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_NoiseLevel', 0)  # default Average
replace_column_nan(business_df_replace, 'attributes_OutdoorSeating', 0)  # default True
replace_column_nan(business_df_replace, 'attributes_GoodForMeal', 0)  # default good for lunch and dinner
business_df_replace = expand_dict_to_columns(business_df_replace, 'attributes_GoodForMeal')
replace_column_nan(business_df_replace, 'attributes_RestaurantsAttire', 0)  # default casual
replace_column_nan(business_df_replace, 'attributes_RestaurantsDelivery', 0)  # default false
replace_column_nan(business_df_replace, 'attributes_RestaurantsGoodForGroups', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsPriceRange2', 0)  # default 2$ signs
replace_column_nan(business_df_replace, 'attributes_RestaurantsReservations', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTakeOut', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_WheelchairAccessible', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_WiFi', 0)  # default free
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true
replace_column_nan(business_df_replace, 'attributes_RestaurantsTableService', 0)  # default true

#### Changing categorical input to numerical

In [10]:
for col in business_df_replace.columns:
    if col not in ['business_id', 'stars']: 
        if business_df_replace[col].dtypes == bool:
            # true becomes 1, false becomes 0
            business_df_replace[col] *=1
        elif business_df_replace[col].dtypes != np.dtype('int32') and business_df_replace[col].dtypes != np.dtype('int64') and business_df_replace[col].dtypes != float: 
            #changes categorical values to numerical values
            business_df_replace[col] = business_df_replace[col].astype('category').cat.codes

In [11]:
#note, the business_id column is not numerical, I kept it the same
business_df_replace.isnull().sum()

attributes_Alcohol                       0
attributes_BikeParking                   0
attributes_BusinessAcceptsCreditCards    0
attributes_Caters                        0
attributes_GoodForKids                   0
attributes_HasTV                         0
attributes_NoiseLevel                    0
attributes_OutdoorSeating                0
attributes_RestaurantsAttire             0
attributes_RestaurantsDelivery           0
attributes_RestaurantsGoodForGroups      0
attributes_RestaurantsPriceRange2        0
attributes_RestaurantsReservations       0
attributes_RestaurantsTableService       0
attributes_RestaurantsTakeOut            0
attributes_WheelchairAccessible          0
attributes_WiFi                          0
business_id                              0
categories                               0
city                                     0
review_count                             0
stars                                    0
state                                    0
casual     

In [12]:
# USE THIS
business_df_replace

Unnamed: 0,attributes_Alcohol,attributes_BikeParking,attributes_BusinessAcceptsCreditCards,attributes_Caters,attributes_GoodForKids,attributes_HasTV,attributes_NoiseLevel,attributes_OutdoorSeating,attributes_RestaurantsAttire,attributes_RestaurantsDelivery,...,lot,street,valet,validated,breakfast,brunch,dessert,dinner,latenight,lunch
0,1,1,1,1,0,0,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,1,1,1,0,1,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1
2,1,0,0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1
3,0,1,1,0,1,1,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
4,1,1,1,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
5,1,1,0,1,1,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1
6,1,1,1,1,1,1,0,0,0,1,...,0,1,0,0,0,0,0,1,0,0
7,2,1,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
8,1,1,0,1,1,1,0,1,0,0,...,1,0,0,0,0,0,0,1,0,1
9,1,0,1,1,0,1,3,1,1,0,...,0,0,1,0,0,0,0,1,0,0


## Preprocessing User data
The users are mostly good for numerical features. However the following features 'elite', 'friends', 'name', 'yelping_since' are not but I decided to drop them as they do not intuitively seem super important. 

In [13]:
users_df = pd.read_csv("all/users.csv")
users_df_replace = users_df.copy()
users_df_replace.drop([ 'elite', 'friends', 'name', 'yelping_since'], axis=1, inplace=True)
users_df_replace.columns

Index(['average_stars', 'compliment_cool', 'compliment_cute',
       'compliment_funny', 'compliment_hot', 'compliment_list',
       'compliment_more', 'compliment_note', 'compliment_photos',
       'compliment_plain', 'compliment_profile', 'compliment_writer', 'cool',
       'fans', 'funny', 'review_count', 'useful', 'user_id'],
      dtype='object')

In [14]:
#check that all data is numerical, should output nothing if it is
for col in users_df_replace.columns:
    if col not in ['user_id']: 
        if users_df_replace[col].dtypes != np.dtype('int32') and users_df_replace[col].dtypes != np.dtype('int64') and users_df_replace[col].dtypes != float:
            print(col)

In [15]:
#check that there are no Nan values
users_df_replace.isnull().sum()

average_stars         0
compliment_cool       0
compliment_cute       0
compliment_funny      0
compliment_hot        0
compliment_list       0
compliment_more       0
compliment_note       0
compliment_photos     0
compliment_plain      0
compliment_profile    0
compliment_writer     0
cool                  0
fans                  0
funny                 0
review_count          0
useful                0
user_id               0
dtype: int64

In [16]:
# use this
users_df_replace

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,compliment_profile,compliment_writer,cool,fans,funny,review_count,useful,user_id
0,2.83,0,0,0,0,0,0,0,0,1,0,0,0,0,2,6,7,UxfpKHGO2dfQCdS9xLLJow
1,3.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,Kr5NDQFPPB_01-5CDmSqVg
2,3.09,0,0,0,0,0,0,0,0,0,0,0,0,1,0,10,2,wfoeMtriLwZsdRzcxNTaFA
3,4.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,aXb0kCIsIbPEEUSGomrrmA
4,4.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,sLrX2KGu3lc_JczAnsg0_Q
5,3.33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,3,nmYitfmo-pQ1hJWDnTLwGg
6,4.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,5tm0BfJEWGJWowr3sPGb8Q
7,1.50,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,PzHuq79aP6G25kEv-hejOA
8,1.00,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,GVYg18F-Rkuk63hvtHoG5Q
9,2.33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0IqKVB1rbaDyz0wlefmiAA


## Preprocessing Review Data

In [17]:
sample_submission = pd.read_csv("all/sample_submission.csv")
train_reviews = pd.read_csv("all/train_reviews.csv")
train_reviews = train_reviews[['user_id', 'business_id', 'stars']]
train_reviews

Unnamed: 0,user_id,business_id,stars
0,VDh1vjzpNUJH6HfcjH8g7Q,WPCgtEG-bJt0cZtnM-x7yw,4.0
1,HnnjIuLrdhLTsRRVrrFIjA,LnnO7quTjjdTUkCshSJnkA,5.0
2,HnnjIuLrdhLTsRRVrrFIjA,sKrlmbrZWCyLIgiMihCPqw,5.0
3,HnnjIuLrdhLTsRRVrrFIjA,Lh5qnT2m2b4lvyYiMGMDkg,4.0
4,HnnjIuLrdhLTsRRVrrFIjA,54LYVM1gCGQ2UVFK9QhgTw,5.0
5,HnnjIuLrdhLTsRRVrrFIjA,08-b4GbZxOzzo9XSJsR-tw,5.0
6,2FXuEqmoQUyyzRFH9_Je0Q,6NG_A-epYEpsJSugfAaTRQ,5.0
7,HnnjIuLrdhLTsRRVrrFIjA,8FqfLM0Kv3Grr9l8bOAlCA,4.0
8,HnnjIuLrdhLTsRRVrrFIjA,O_UC_izJXcAmkm6HlEyGSA,4.0
9,TZRCpxTnEWEaiKXeqF_7ng,U-a61zpbsDNVtKm9W1aqLw,4.0


In [18]:
train_reviews_replace = train_reviews.copy()
reviews_denorm = pd.merge(train_reviews_replace, users_df_replace, how='left', on='user_id')
reviews_denorm = pd.merge(reviews_denorm, business_df_replace, how='left', on='business_id')
reviews_denorm = reviews_denorm.rename(columns={'stars_x': 'review_stars', 'stars_y': 'business_stars'})
train_df_y = reviews_denorm['review_stars']
train_df_x = reviews_denorm.copy()
train_df_x.drop(['review_stars', 'business_id', 'user_id'], axis=1, inplace=True)
train_df_y

0         4.0
1         5.0
2         5.0
3         4.0
4         5.0
5         5.0
6         5.0
7         4.0
8         4.0
9         4.0
10        5.0
11        2.0
12        4.0
13        5.0
14        5.0
15        3.0
16        5.0
17        5.0
18        4.0
19        3.0
20        1.0
21        5.0
22        5.0
23        4.0
24        5.0
25        5.0
26        3.0
27        4.0
28        3.0
29        5.0
         ... 
150202    3.0
150203    5.0
150204    5.0
150205    5.0
150206    3.0
150207    4.0
150208    5.0
150209    1.0
150210    5.0
150211    3.0
150212    2.0
150213    5.0
150214    3.0
150215    3.0
150216    5.0
150217    4.0
150218    5.0
150219    5.0
150220    4.0
150221    5.0
150222    4.0
150223    5.0
150224    5.0
150225    5.0
150226    3.0
150227    3.0
150228    5.0
150229    4.0
150230    5.0
150231    4.0
Name: review_stars, Length: 150232, dtype: float64

In [19]:
from knn import KNN
knn = KNN()
knn.train(train_df_x.values, train_df_y.values)


In [20]:
reviews_denorm.groupby('review_stars').size()

review_stars
1.0     8833
2.0    13365
3.0    28302
4.0    53634
5.0    46098
dtype: int64

In [21]:
validate_df = pd.read_csv("all/validate_queries.csv")
validate_df_denorm = pd.merge(validate_df, users_df_replace, how='left', on='user_id')
validate_df_denorm = pd.merge(validate_df_denorm, business_df_replace, how='left', on='business_id')
validate_df_denorm = validate_df_denorm.rename(columns={'stars_x': 'review_stars', 'stars_y': 'business_stars'})
test_df_y = validate_df_denorm['review_stars']
test_df_x = validate_df_denorm.copy()
test_df_x.drop(['Unnamed: 0', 'review_stars', 'business_id', 'user_id'], axis=1, inplace=True)
test_df_x

Unnamed: 0,average_stars,compliment_cool,compliment_cute,compliment_funny,compliment_hot,compliment_list,compliment_more,compliment_note,compliment_photos,compliment_plain,...,lot,street,valet,validated,breakfast,brunch,dessert,dinner,latenight,lunch
0,2.63,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,1,1
1,3.78,4,0,4,2,0,0,14,1,12,...,0,0,0,0,0,0,0,1,0,0
2,3.48,5,0,5,2,0,1,8,1,4,...,0,0,0,0,1,1,0,0,1,1
3,3.47,13,0,13,6,0,2,1,14,5,...,0,1,0,0,0,1,1,0,0,1
4,4.13,2,0,2,0,0,0,1,2,0,...,1,0,0,0,0,0,0,1,1,1
5,3.77,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,1
6,3.60,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,1
7,3.31,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,1,0,1,0,1
8,3.97,1,0,1,2,0,0,2,1,3,...,1,0,0,0,0,0,0,1,0,1
9,3.67,3,0,3,0,0,0,5,1,3,...,1,0,0,0,1,0,1,0,0,0


In [22]:
set(train_df_x.columns)-set(test_df_x.columns)


set()

In [23]:
set(test_df_x.columns)-set(train_df_x.columns)

set()

In [29]:
from sklearn.neighbors import KNeighborsClassifier  
classifier = KNeighborsClassifier(n_neighbors=5) 
classifier.fit(train_df_x.values, train_df_y.values)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [30]:
y_pred = classifier.predict(test_df_x)


In [31]:
from sklearn.metrics import classification_report, confusion_matrix 
print(classification_report(test_df_y.values, y_pred))

              precision    recall  f1-score   support

         1.0       0.20      0.18      0.19      4139
         2.0       0.09      0.07      0.08      3850
         3.0       0.19      0.21      0.20      7147
         4.0       0.33      0.42      0.37     14139
         5.0       0.51      0.43      0.47     20802

   micro avg       0.35      0.35      0.35     50077
   macro avg       0.26      0.26      0.26     50077
weighted avg       0.36      0.35      0.35     50077

