In [1]:
import os
import re
import pandas as pd
import numpy as np
import random

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Load Files

In [2]:
kn_reviews_df = pd.read_json("../businesses_neighbours_reviews.json",lines=True)
kn_reviews_df.head()

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,5e9d3052a2d166c8b8035d16,g2X8qjl7tm8ssDEdssmbkQ,q3qW_NPK-pum82QoNKsIkw,CCzhIWR_Y1OA4DVENNQfdg,1,0,1,0,When I returned home last night there were sev...,2014-04-04 21:40:55
1,5e9d3067050fff0a09bb2100,CKY-x13MTj57V7lotf42hQ,7SewGeL46DU4JLzH929qSA,CCzhIWR_Y1OA4DVENNQfdg,4,1,0,0,Waters Edge isn't a bad place to live. The nei...,2013-12-31 04:54:11
2,5e9d3098557511c76dd0f297,yfSQ3wkWjheDMyxy4WXQnQ,-rHf8BoWEHEUp6V7fZ8N7g,CCzhIWR_Y1OA4DVENNQfdg,1,6,3,1,I have been a tenant here for going on 2 years...,2014-10-14 02:16:15
3,5e9d309ed26d55e84aafebd4,8CtjhFVRzleoWsBEzHHKRQ,mXe0QQyqfD-xrKJy9tHfiA,CCzhIWR_Y1OA4DVENNQfdg,3,0,0,0,"Other than the mold on window sills, it's not ...",2019-05-03 18:16:04
4,5e9d3101514c9fe943231703,zLcrl2baLeamoUB6ayURuQ,A9penMv8g6xNfxbjI0lx7A,5fPIYHSdQNDfZZv7BaDgSw,4,0,0,1,My experience at this complex has been very po...,2018-04-12 20:55:21


# Build TF-IDF for reviews

In [3]:
reviews_texts=kn_reviews_df['text']

### Clean especial characters

In [4]:
clear_text_list=[]
for x in reviews_texts:
    y=re.sub(r'[,.!-?¿¡"&$%#\n\t]','',x.lower())
    clear_text_list.append(y)
clear_text_list[:5]

['when i returned home last night there were seven marked and one unmarked police cars in front if my building i called the office to see what was going on today and was told it was because people were hanging out in their cars and playing music loudlyim pretty sure a complex manager just lied to me i missed work because they never plowed the roadway when we got that  inches of snow ill never suggest anyone to live here and if it werent such a pain to move i live here alone few friends no one to help id get out at the end of my leaseupdate july   it was a domestic dispute that happened the night i originally posted they didnt find out what it actually was until they got their monthly report from the sheriffs department so they didnt lie per se they were just guessing',
 'waters edge isnt a bad place to live the neighbors are friendly the apartments arent top of the line but theyre good for the price you pay maintenance has always been helpful quick to respond to any issues we may have 

### Se crea una matrix de conteo de (textos x palabras), se ignoran las palabras que aparezcan en el 85% de los documentos(no relevantes)

In [5]:
cv=CountVectorizer(max_df=0.85, stop_words="english")
word_count_vector=cv.fit_transform(clear_text_list)

### Se convierte la matriz dispersa a dataframe

In [6]:
keywords_df = pd.DataFrame(word_count_vector.toarray())

In [7]:
keywords_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [30]:
#Get the number of occurrences of a keyword in all reviews
lista_de_tuplas=list(zip(cv.get_feature_names(),keywords_df.sum().to_list()))
#sort
lista_de_tuplas.sort(key=lambda tup: tup[1], reverse=True)
list_of_lists = [list(elem) for elem in lista_de_tuplas]
list_of_lists [:10]

[['apartment', 70],
 ['place', 49],
 ['management', 37],
 ['just', 33],
 ['live', 32],
 ['office', 32],
 ['dont', 31],
 ['water', 27],
 ['maintenance', 26],
 ['months', 26]]

In [9]:
#get_feature_names retorna el nombre de las entidades, no el indice, lista todas las keywords encontradas
feature_names=cv.get_feature_names()
feature_names[:5]

['able', 'absolute', 'absolutely', 'ac', 'acceptable']

In [10]:
#Calculo de TF- IDF sobre la matriz dispersa, smooth_idf modifica la formula matematica False para no ignorar completamente los terminos que aparecen en todos los textos
#Se utiliza normalizacion coseno
#use_idf true para calcular la ponderacion inversa de frecuencia
Tfidf_transformer=TfidfTransformer(smooth_idf=False,use_idf=True)
Tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)

In [11]:
keyword_tf_idf_df = pd.DataFrame(Tfidf_transformer.transform(word_count_vector).toarray())
keyword_tf_idf_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.163369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.079513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109375,0.0,0.0


In [12]:
def sort_coo(coo_matrix):
    tuples=zip(coo_matrix.col,coo_matrix.data)
    return sorted(tuples,key=lambda x:(x[1],x[0]),reverse=True)

#Extra de todas las keywords las n-keywords mas relevantes(TF-IDF)
def extract_topn_from_vector(feature_names, sorted_items,topn=10):
    sorted_items=sorted_items[:topn]
    score_vals=[]
    feature_vals=[]   
    for idx,score in sorted_items:
        score_vals.append(round(score,3))
        feature_vals.append(feature_names[idx])
    results={}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    return results 

In [13]:
#Calculo de TF-IDF y encontrar top 10 de keywords para cada texto de las reviews de los vecinos
keywordsArray=[]
for text in clear_text_list:
    tf_idf_vector=Tfidf_transformer.transform(cv.transform([text]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    keywordsArray.append(keywords)
keywordsArray[:5]

[{'cars': 0.238,
  'night': 0.197,
  'got': 0.16,
  'didnt': 0.155,
  'unmarked': 0.152,
  'snow': 0.152,
  'sheriffs': 0.152,
  'se': 0.152,
  'roadway': 0.152,
  'report': 0.152},
 {'theyre': 0.291,
  'stocked': 0.225,
  'snappers': 0.225,
  'pond': 0.225,
  'fish': 0.225,
  'edge': 0.225,
  'beware': 0.225,
  'baby': 0.225,
  'waters': 0.194,
  'renovated': 0.194},
 {'health': 0.603,
  'mold': 0.313,
  'tenant': 0.165,
  'upkeep': 0.121,
  'severe': 0.121,
  'seeking': 0.121,
  'sacrifice': 0.121,
  'respiratory': 0.121,
  'politicians': 0.121,
  'photos': 0.121},
 {'mold': 0.494,
  'wanna': 0.286,
  'sills': 0.286,
  'luxury': 0.286,
  'expensive': 0.286,
  'best': 0.224,
  'quiet': 0.208,
  'price': 0.208,
  'family': 0.208,
  'window': 0.195},
 {'positive': 0.409,
  'minor': 0.409,
  'resolved': 0.321,
  'orders': 0.321,
  'responsive': 0.297,
  'experience': 0.265,
  'couple': 0.252,
  'issues': 0.242,
  'days': 0.216,
  'work': 0.196}]

In [14]:
reviews_keywords=[]
for text in clear_text_list:
    tf_idf_vector=Tfidf_transformer.transform(cv.transform([text]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    reviews_keywords.append( list( keywords.keys() ) )
reviews_keywords[:5]

[['cars',
  'night',
  'got',
  'didnt',
  'unmarked',
  'snow',
  'sheriffs',
  'se',
  'roadway',
  'report'],
 ['theyre',
  'stocked',
  'snappers',
  'pond',
  'fish',
  'edge',
  'beware',
  'baby',
  'waters',
  'renovated'],
 ['health',
  'mold',
  'tenant',
  'upkeep',
  'severe',
  'seeking',
  'sacrifice',
  'respiratory',
  'politicians',
  'photos'],
 ['mold',
  'wanna',
  'sills',
  'luxury',
  'expensive',
  'best',
  'quiet',
  'price',
  'family',
  'window'],
 ['positive',
  'minor',
  'resolved',
  'orders',
  'responsive',
  'experience',
  'couple',
  'issues',
  'days',
  'work']]

In [15]:
df_reviews_keywords = pd.DataFrame(reviews_keywords)
df_reviews_keywords

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,cars,night,got,didnt,unmarked,snow,sheriffs,se,roadway,report
1,theyre,stocked,snappers,pond,fish,edge,beware,baby,waters,renovated
2,health,mold,tenant,upkeep,severe,seeking,sacrifice,respiratory,politicians,photos
3,mold,wanna,sills,luxury,expensive,best,quiet,price,family,window
4,positive,minor,resolved,orders,responsive,experience,couple,issues,days,work
5,took,unreliable,shooting,raising,protection,inconsiderate,handicapped,closing,chris,blocks
6,phone,day,management,slipshod,okay,mandel,informed,employees,effort,contact
7,fob,renovated,plumbing,problems,save,paid,apartment,worst,amenities,roaches
8,rats,possibly,pane,mice,idiots,homeless,heat,agreement,adding,rent
9,till,maxine,damn,window,customer,tenants,didnt,yucca,yelp,unbearable


In [16]:
reviews_keywords_scores=[]
for text in clear_text_list:
    tf_idf_vector=Tfidf_transformer.transform(cv.transform([text]))
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    keywords=extract_topn_from_vector(feature_names,sorted_items,10)
    reviews_keywords_scores.append( list( keywords.values() ))
reviews_keywords_scores[:5]

[[0.238, 0.197, 0.16, 0.155, 0.152, 0.152, 0.152, 0.152, 0.152, 0.152],
 [0.291, 0.225, 0.225, 0.225, 0.225, 0.225, 0.225, 0.225, 0.194, 0.194],
 [0.603, 0.313, 0.165, 0.121, 0.121, 0.121, 0.121, 0.121, 0.121, 0.121],
 [0.494, 0.286, 0.286, 0.286, 0.286, 0.224, 0.208, 0.208, 0.208, 0.195],
 [0.409, 0.409, 0.321, 0.321, 0.297, 0.265, 0.252, 0.242, 0.216, 0.196]]

In [17]:
kn_reviews_df['useful_mean'] = kn_reviews_df['useful'].mean()
kn_reviews_df['is_useful'] = kn_reviews_df['useful'] > kn_reviews_df['useful_mean']

In [18]:
kn_reviews_df

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,useful_mean,is_useful
0,5e9d3052a2d166c8b8035d16,g2X8qjl7tm8ssDEdssmbkQ,q3qW_NPK-pum82QoNKsIkw,CCzhIWR_Y1OA4DVENNQfdg,1,0,1,0,When I returned home last night there were sev...,2014-04-04 21:40:55,2.423729,False
1,5e9d3067050fff0a09bb2100,CKY-x13MTj57V7lotf42hQ,7SewGeL46DU4JLzH929qSA,CCzhIWR_Y1OA4DVENNQfdg,4,1,0,0,Waters Edge isn't a bad place to live. The nei...,2013-12-31 04:54:11,2.423729,False
2,5e9d3098557511c76dd0f297,yfSQ3wkWjheDMyxy4WXQnQ,-rHf8BoWEHEUp6V7fZ8N7g,CCzhIWR_Y1OA4DVENNQfdg,1,6,3,1,I have been a tenant here for going on 2 years...,2014-10-14 02:16:15,2.423729,True
3,5e9d309ed26d55e84aafebd4,8CtjhFVRzleoWsBEzHHKRQ,mXe0QQyqfD-xrKJy9tHfiA,CCzhIWR_Y1OA4DVENNQfdg,3,0,0,0,"Other than the mold on window sills, it's not ...",2019-05-03 18:16:04,2.423729,False
4,5e9d3101514c9fe943231703,zLcrl2baLeamoUB6ayURuQ,A9penMv8g6xNfxbjI0lx7A,5fPIYHSdQNDfZZv7BaDgSw,4,0,0,1,My experience at this complex has been very po...,2018-04-12 20:55:21,2.423729,False
5,5e9d3107f221495a0188fa6e,3oEVcAuhSvRz0F3WsdqXYQ,w-oFxLDJhC_N_zBZr-XnAg,5fPIYHSdQNDfZZv7BaDgSw,1,1,0,0,This place used to be awesome until Chris took...,2016-05-17 05:37:34,2.423729,False
6,5e9d3107f221495a01890e4f,Jdw8XpOW-I8VZGttte6q8A,zigZr8sC7JfVO1X8I0Wx2A,5fPIYHSdQNDfZZv7BaDgSw,1,2,3,1,The place might be an okay to live in but the ...,2014-02-23 00:17:21,2.423729,False
7,5e9d311c1afade628798f93d,u9V4mO4oIiz3MJzOGmvYMQ,FT4j9J3bf0nUjFJ_xuhjow,21aC37_3omT8i52fEQBibQ,1,1,0,0,The worst apartment complex ever. Paid for ame...,2019-11-14 10:44:43,2.423729,False
8,5e9d311d1afade6287999b23,OCNpjaHn3ZVVAlIofJ9wsQ,MAQfd-0xvmb2-s4Wh6VkOA,21aC37_3omT8i52fEQBibQ,1,1,0,0,DO NOT MOVE IN TO THE TIDES!! They lie about y...,2019-11-02 04:59:47,2.423729,False
9,5e9d311d1afade628799aaa6,SFbcO9V9KtWfp3U4y-hlng,cAiPiS6efd1ioVeoKyvsog,21aC37_3omT8i52fEQBibQ,1,1,0,0,Worst place I've ever had the displeasure of l...,2019-12-10 21:23:47,2.423729,False


In [19]:
kn_reviews_df['useful'].describe()

count    59.000000
mean      2.423729
std       3.296829
min       0.000000
25%       0.000000
50%       1.000000
75%       3.000000
max      13.000000
Name: useful, dtype: float64

In [20]:
kn_reviews_df['is_useful'].describe()

count        59
unique        2
top       False
freq         41
Name: is_useful, dtype: object

In [21]:
reviews_keywords_df = pd.DataFrame(reviews_keywords)
reviews_keywords_df = pd.concat([kn_reviews_df['review_id'], kn_reviews_df['is_useful'], reviews_keywords_df], axis=1)
reviews_keywords_df

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,8,9
0,g2X8qjl7tm8ssDEdssmbkQ,False,cars,night,got,didnt,unmarked,snow,sheriffs,se,roadway,report
1,CKY-x13MTj57V7lotf42hQ,False,theyre,stocked,snappers,pond,fish,edge,beware,baby,waters,renovated
2,yfSQ3wkWjheDMyxy4WXQnQ,True,health,mold,tenant,upkeep,severe,seeking,sacrifice,respiratory,politicians,photos
3,8CtjhFVRzleoWsBEzHHKRQ,False,mold,wanna,sills,luxury,expensive,best,quiet,price,family,window
4,zLcrl2baLeamoUB6ayURuQ,False,positive,minor,resolved,orders,responsive,experience,couple,issues,days,work
5,3oEVcAuhSvRz0F3WsdqXYQ,False,took,unreliable,shooting,raising,protection,inconsiderate,handicapped,closing,chris,blocks
6,Jdw8XpOW-I8VZGttte6q8A,False,phone,day,management,slipshod,okay,mandel,informed,employees,effort,contact
7,u9V4mO4oIiz3MJzOGmvYMQ,False,fob,renovated,plumbing,problems,save,paid,apartment,worst,amenities,roaches
8,OCNpjaHn3ZVVAlIofJ9wsQ,False,rats,possibly,pane,mice,idiots,homeless,heat,agreement,adding,rent
9,SFbcO9V9KtWfp3U4y-hlng,False,till,maxine,damn,window,customer,tenants,didnt,yucca,yelp,unbearable


In [22]:
reviews_keywords_scores_df = pd.DataFrame(reviews_keywords_scores).fillna(0)
#reviews_keywords_scores_df = pd.concat([kn_reviews_df['is_useful'], reviews_keywords_scores_df], axis=1)
reviews_keywords_scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.238,0.197,0.16,0.155,0.152,0.152,0.152,0.152,0.152,0.152
1,0.291,0.225,0.225,0.225,0.225,0.225,0.225,0.225,0.194,0.194
2,0.603,0.313,0.165,0.121,0.121,0.121,0.121,0.121,0.121,0.121
3,0.494,0.286,0.286,0.286,0.286,0.224,0.208,0.208,0.208,0.195
4,0.409,0.409,0.321,0.321,0.297,0.265,0.252,0.242,0.216,0.196
5,0.224,0.219,0.219,0.219,0.219,0.219,0.219,0.219,0.219,0.219
6,0.246,0.245,0.216,0.18,0.18,0.18,0.18,0.18,0.18,0.18
7,0.38,0.328,0.328,0.298,0.276,0.276,0.256,0.235,0.225,0.216
8,0.226,0.226,0.226,0.226,0.226,0.226,0.226,0.226,0.226,0.217
9,0.219,0.219,0.171,0.149,0.142,0.12,0.112,0.109,0.109,0.109


## Find reviews for a business

In [23]:
keywords_df_with_class = pd.concat([kn_reviews_df['review_id'], kn_reviews_df['is_useful'], keyword_tf_idf_df], axis=1)
keywords_df_with_class

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,...,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
0,g2X8qjl7tm8ssDEdssmbkQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,CKY-x13MTj57V7lotf42hQ,False,0.163369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,yfSQ3wkWjheDMyxy4WXQnQ,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8CtjhFVRzleoWsBEzHHKRQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,zLcrl2baLeamoUB6ayURuQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3oEVcAuhSvRz0F3WsdqXYQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Jdw8XpOW-I8VZGttte6q8A,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,u9V4mO4oIiz3MJzOGmvYMQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,OCNpjaHn3ZVVAlIofJ9wsQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,SFbcO9V9KtWfp3U4y-hlng,False,0.079513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109375,0.0,0.0


In [24]:
#Con esta configuración se utilizan los 3 vecinos más cercanos, con distancia euclidiana
knn_clasif=KNeighborsClassifier(3)

In [25]:
# Fit recibe la matriz de entrenamiento y la clase objetivo
knn_clasif.fit(keyword_tf_idf_df, keywords_df_with_class['is_useful'])

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [26]:
# llamamos predict sobre  los test , creando una nueva columna en el dataframe de test
keywords_df_with_class['predict']=knn_clasif.predict(keyword_tf_idf_df)
keywords_df_with_class

Unnamed: 0,review_id,is_useful,0,1,2,3,4,5,6,7,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,predict
0,g2X8qjl7tm8ssDEdssmbkQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
1,CKY-x13MTj57V7lotf42hQ,False,0.163369,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
2,yfSQ3wkWjheDMyxy4WXQnQ,True,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
3,8CtjhFVRzleoWsBEzHHKRQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,zLcrl2baLeamoUB6ayURuQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
5,3oEVcAuhSvRz0F3WsdqXYQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
6,Jdw8XpOW-I8VZGttte6q8A,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
7,u9V4mO4oIiz3MJzOGmvYMQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
8,OCNpjaHn3ZVVAlIofJ9wsQ,False,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
9,SFbcO9V9KtWfp3U4y-hlng,False,0.079513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.109375,0.0,0.0,False


In [27]:
tn, fp, fn, tp = confusion_matrix(keywords_df_with_class['is_useful'],keywords_df_with_class['predict'], labels=[False,True]).ravel()

In [28]:
print(tn)
print(fp)
print(fn)
print(tp)

37
4
3
15


In [29]:
precision_recall_fscore_support(keywords_df_with_class['is_useful'],keywords_df_with_class['predict'], pos_label=True,average='binary')

(0.7894736842105263, 0.8333333333333334, 0.8108108108108109, None)