In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import operator

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb

# datasets
# Both files are read via relative paths, i.e. from the kernel's current working directory.
# `df` is later subset with the feature lists defined below and must also carry a 'fake' column;
# `df_text` is later used with every column except 'product_ID' and 'fake' as features.
df = pd.read_csv('product_level_data_with_img_feats.csv.gz')
df_text = pd.read_csv('product_level_data_text_feats.csv.gz')

# features
# Column names grouped by family; each list is handed to classification_results
# further down to select the model inputs.
review_features = [
    'tfidf_review_body',
    'n_of_reviews',
    'avg_review_rating',
    'avg_days_between_reviews',
    'stdev_days_between_reviews',
    'max_days_between_reviews',
    'min_days_between_reviews',
    'share_helpful_reviews',
    'share_1star',
    'share_5star',
    'share_photo',
    'std_review_len',
]
network_features = [
    'pagerank',
    'w_degree',
    'clustering_coef',
    'eigenvector_cent',
]
image_sim_features = [
    'min_sim',
    'max_sim',
    'mean_sim',
    'std_sim',
    'min_sim_review',
    'max_sim_review',
    'mean_sim_review',
    'std_sim_review',
    'min_sim_product',
    'max_sim_product',
    'mean_sim_product',
    'std_sim_product',
]

# correlation matrix
# corr_table = df[review_features + network_features + image_sim_features].corr()
# corr_table.to_csv(path + 'Amazon Review Data/corr_table.csv')

############################## FUNCTIONS
def model_building(X_train, y_train, X_test, y_test, model):
	"""Fit ``model`` on the training split and print its test-split metrics.

	Prints a header line followed by one comma-separated line holding, in order:
	ROC AUC, accuracy, the share of ``model.classes_[0]`` rows predicted correctly
	(printed under "TN"), the share of ``model.classes_[1]`` rows predicted
	correctly (printed under "TP"), and the weighted F1 score. Note that the
	"TN"/"TP" columns are therefore per-class rates, not counts.

	Parameters
	----------
	X_train, y_train : features and labels passed to ``model.fit``.
	X_test, y_test : held-out features and labels every metric is computed on.
	model : estimator exposing ``fit``, ``predict``, ``predict_proba`` and
		``classes_``; it is fitted in place.

	Returns
	-------
	Column 1 of ``model.predict_proba(X_test)``.

	The confusion-matrix indexing below is written for a two-class problem.
	"""
	model.fit(X_train, y_train)

	y_pred = model.predict(X_test)
	# rows/columns of cm follow the order of model.classes_
	cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
	# computed once and reused for both the AUC and the return value
	probs = model.predict_proba(X_test)[:,1]

	auc = metrics.roc_auc_score(y_test, probs)
	accuracy = cm.diagonal().sum() / X_test.shape[0]
	first_class_rate = cm[0,0] / cm[0,:].sum()
	second_class_rate = cm[1,1] / cm[1,:].sum()
	f1 = metrics.f1_score(y_test, y_pred, average='weighted')

	print("AUC, Accuracy, TN, TP, F1 Score")
	print("{}, {}, {}, {}, {}".format(auc, accuracy, first_class_rate, second_class_rate, f1))

	return probs

def classification_results(df, features=None, stars=None):
	"""Train four classifiers on an 80/20 split of ``df`` and print their metrics.

	The models are, in order: logistic regression, random forest (followed by
	its feature importances), a linear-kernel SVC and an XGBoost classifier.
	Each one is fitted and reported through ``model_building``.

	Parameters
	----------
	df : DataFrame holding the target column 'fake' and the feature columns.
	features : column names to use as inputs. When ``None``, every column
		except 'product_ID' and 'fake' is used.
	stars : unused; kept so existing callers keep working.

	Returns
	-------
	None. All results are printed.
	"""
	# `is None` rather than `== None`: the latter is evaluated element-wise for
	# array-like feature lists (numpy array, pandas Index) and then fails in `if`.
	if features is None:
		X = df.drop(['product_ID','fake'], axis=1)
		features = list(X.columns)
	else:
		X = df[features]
	y = df['fake']

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
	print(X_train.shape, X_test.shape)

	# the scaler is fitted on the training rows only and then applied to the test rows
	scaler = StandardScaler()
	X_train = scaler.fit_transform(X_train)
	X_test = scaler.transform(X_test)

	print("="*10 + "Logistic Regression" + "="*10)
	model = LogisticRegression(max_iter=400)
	model_building(X_train, y_train, X_test, y_test, model)

	print("="*10 + "Random Forest" + "="*10)
	model = RandomForestClassifier(random_state=42,
	                               n_estimators=100,
	                               min_samples_leaf=3,
	                               min_samples_split=6,
	                               max_features='sqrt',
	                               max_depth=40,
	                               bootstrap=True,
	                               n_jobs=-1)
	model_building(X_train, y_train, X_test, y_test, model)

	print("="*10 + "RF Feature Importance" + "="*10)
	# `model` is still the fitted random forest at this point
	feat_imp = dict(zip(features, model.feature_importances_))
	ranked = sorted(feat_imp.items(), key=operator.itemgetter(1), reverse=True)
	# with more than 100 inputs only the 50 most important ones are shown
	print(ranked[:50] if len(features) > 100 else ranked)

	print("="*10 + "SVC Linear" + "="*10)
	# probability=True calibrates with an internal shuffled cross-validation;
	# seeding it makes the reported AUC reproducible like the other models.
	model = SVC(kernel='linear', probability=True, random_state=42)
	model_building(X_train, y_train, X_test, y_test, model)

	print("="*10 + "XGBoost" + "="*10)
	model = xgb.XGBClassifier()
	model_building(X_train, y_train, X_test, y_test, model)

####################### RESULTS
# One entry per experiment: (banner printed first, frame to model, feature columns).
# A feature entry of None makes classification_results use every column of the
# frame except 'product_ID' and 'fake'.
_experiments = [
	# review features
	("\n+++++++++++++++++ Review Features ++++++++++++++++", df, review_features),
	# image features
	("\n+++++++++++++++++ Image Features ++++++++++++++++\n", df, image_sim_features),
	# network features
	("\n+++++++++++++++++ Network Features ++++++++++++++++\n", df, network_features),
	# top-2 network features
	("\n+++++++++++++++++ Top 2 Network Features ++++++++++++++++\n", df, ['eigenvector_cent', 'clustering_coef']),
	# all features
	("\n+++++++++++++++++ All Features ++++++++++++++++\n", df, review_features+image_sim_features+network_features),
	# all text features
	("\n+++++++++++++++++ All Text ++++++++++++++++\n", df_text, None),
]

for _banner, _frame, _columns in _experiments:
	print(_banner)
	classification_results(_frame, _columns)


+++++++++++++++++ Review Features ++++++++++++++++
(2661, 12) (666, 12)
AUC, Accuracy, TN, TP, F1 Score
0.8381394920868607, 0.7852852852852853, 0.8447368421052631, 0.7062937062937062, 0.7837976660156704
AUC, Accuracy, TN, TP, F1 Score
0.8739786529260214, 0.8108108108108109, 0.8578947368421053, 0.7482517482517482, 0.8099259041135368
[('share_photo', np.float64(0.20188060207897046)), ('max_days_between_reviews', np.float64(0.12458869568691879)), ('n_of_reviews', np.float64(0.11969511835789111)), ('share_5star', np.float64(0.09613061263469783)), ('avg_days_between_reviews', np.float64(0.09050997532436836)), ('stdev_days_between_reviews', np.float64(0.0794121323652325)), ('tfidf_review_body', np.float64(0.0730594402122357)), ('avg_review_rating', np.float64(0.05466385371918878)), ('std_review_len', np.float64(0.053297265625993115)), ('share_helpful_reviews', np.float64(0.05276730351485979)), ('share_1star', np.float64(0.049645853955146806)), ('min_days_between_reviews', np.float64(0.00434

In [None]:
import pandas as pd
import networkx as nx
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

################################# DATASETS
# reviews: read from the mounted Google Drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')
file_path = '/content/drive/My Drive/UCSD_home_and_kitchen_reviews.csv'
reviews = pd.read_csv(file_path)

# distinct products, total rows, distinct reviewers
print(
	"Number of products, reviews, and reviewers in reviews dataset:",
	len(reviews['product_ID'].unique()),
	reviews.shape[0],
	len(reviews['reviewer_ID'].unique()),
)

# UCSD product level data (relative path, i.e. the kernel's working directory)
df_ucsd = pd.read_csv('UCSD_product_level_data.csv.gz')

# our data (relative path as well)
df_ours = pd.read_csv('product_level_data_without_img_feats.csv.gz')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Number of products, reviews, and reviewers in reviews dataset: 64585 11045767 6130417


In [None]:
# Disabled variant of the data-loading cell above: the code is wrapped in a bare string
# literal, so none of it is executed. It differs from the active cell by reading the
# reviews from a local 'UCSD_home_and_kitchen_reviews.csv.gz' instead of the Drive path.
'''import pandas as pd
import networkx as nx
import scipy

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

################################# DATASETS
# reviews
reviews = pd.read_csv('UCSD_home_and_kitchen_reviews.csv.gz')

print("Number of products, reviews, and reviewers in reviews dataset:", \
				len(reviews.product_ID.unique()),\
				reviews.shape[0],\
				len(reviews.reviewer_ID.unique()))

# UCSD product level data
df_ucsd = pd.read_csv('UCSD_product_level_data.csv.gz')

# our data
df_ours = pd.read_csv('product_level_data_without_img_feats.csv.gz')'''

############################ FUNCTIONS

def scaling_data(df, features):
	"""Return the ``features`` columns of ``df`` after standardisation.

	A new ``StandardScaler`` is fitted on exactly the rows it transforms; the
	fitted scaler itself is discarded and only the transformed values are returned.
	"""
	selected = df[features]
	return StandardScaler().fit_transform(selected)

def weighted_projected_graph(B, nodes, ratio=False):
    """Project graph ``B`` onto ``nodes``, weighting edges by shared neighbours.

    For every ``u`` in ``nodes`` (in iteration order) an edge ``(u, v)`` is added
    to each two-hop neighbour ``v`` of ``u`` that has not itself been processed
    yet, so every pair is linked once, starting from whichever node comes first
    in ``nodes``.

    Parameters
    ----------
    B : networkx graph to project. If it is directed, the result is a
        ``DiGraph`` and ``v``'s predecessors are used as its neighbour set;
        otherwise the result is a ``Graph`` and plain adjacency is used.
    nodes : sized, re-iterable collection of nodes of ``B`` to project onto
        (it is iterated twice and ``len`` is taken).
    ratio : if False, the edge weight is the number of common neighbours; if
        True, that count is divided by ``len(B) - len(nodes)``.

    Returns
    -------
    A new graph carrying ``B``'s graph attributes, the given nodes with their
    node attributes, and the weighted edges described above.

    NOTE(review): the structure looks adapted from networkx's bipartite
    ``weighted_projected_graph`` - confirm provenance before swapping it out.
    """
    if B.is_directed():
        pred = B.pred
        G = nx.DiGraph()
    else:
        pred = B.adj
        G = nx.Graph()
    G.graph.update(B.graph)
    G.add_nodes_from((n, B.nodes[n]) for n in nodes)
    n_top = float(len(B) - len(nodes))

    # Nodes already handled, kept as a set that grows by one element per
    # iteration. Rebuilding it from a list on every pass made the loop quadratic.
    checked = set()
    for u in nodes:
        checked.add(u)
        unbrs = set(B[u])
        # two-hop neighbours of u, minus u itself and everything processed earlier
        nbrs2 = {n for nbr in unbrs for n in B[nbr]} - checked
        for v in nbrs2:
            common = unbrs & set(pred[v])
            weight = len(common) / n_top if ratio else len(common)
            G.add_edge(u, v, weight=weight)
    return G

def obtain_network_features(reviews):
	"""Compute product-level graph features from a review table.

	A graph linking every reviewer to the products they reviewed is built from
	the 'reviewer_ID' and 'product_ID' columns, projected onto the products with
	``weighted_projected_graph`` and summarised per product.

	Parameters
	----------
	reviews : DataFrame with one row per review and the columns 'reviewer_ID'
		and 'product_ID'.

	Returns
	-------
	DataFrame with one row per distinct product and the columns 'product_ID',
	'pagerank', 'eigenvector_cent', 'clustering_coef' and 'w_degree'.
	"""
	# initializing the product-level data
	product_ids = reviews['product_ID'].unique()
	df = pd.DataFrame({"product_ID": product_ids})

	# building the bipartite product-reviewer graph
	# NOTE(review): assumes a reviewer ID never equals a product ID, otherwise the
	# two would collapse into a single node - confirm against the data.
	B = nx.Graph()
	B.add_nodes_from(reviews['reviewer_ID'], bipartite=0)
	B.add_nodes_from(reviews['product_ID'], bipartite=1)
	# Pair the two columns directly instead of using iterrows(): iterrows builds a
	# Series per row (very slow on large tables) and may upcast values to a common dtype.
	B.add_edges_from(zip(reviews['reviewer_ID'], reviews['product_ID']))

	# building the product projected graph
	P = weighted_projected_graph(B, product_ids)

	w_degree_cent = nx.degree(P, weight='weight')
	eig_cent = nx.eigenvector_centrality(P, max_iter=500)
	pr = nx.pagerank(P, alpha=0.85)
	cc = nx.clustering(P)

	# creating the features data (looked up in the row order of df)
	df['pagerank'] = [pr[i] for i in df['product_ID']]
	df['eigenvector_cent'] = [eig_cent[i] for i in df['product_ID']]
	df['clustering_coef'] = [cc[i] for i in df['product_ID']]
	df['w_degree'] = [w_degree_cent[i] for i in df['product_ID']]

	return df

def classification_results(df_train, df_test, features):
	"""Fit a random forest on ``df_train`` and score every row of ``df_test``.

	Note that this definition replaces the earlier
	``classification_results(df, features, stars)`` when both cells are run in
	the same kernel.

	Parameters
	----------
	df_train : DataFrame with the ``features`` columns and the target 'fake'.
	df_test : DataFrame with the ``features`` columns; it is not modified.
	features : list of column names used as model inputs.

	Returns
	-------
	A copy of ``df_test`` with an added 'p_fake' column holding column 1 of the
	model's ``predict_proba`` output.

	Also prints the train/test shapes and how many test rows reach a predicted
	probability of at least 0.5, 0.6 and 0.7.
	"""
	X_train = df_train[features].values
	y_train = df_train['fake'].values
	X_test = df_test[features].values

	# NOTE: inputs are used unscaled; a StandardScaler step was previously
	# commented out at this point.
	print("Shape of train and test:",X_train.shape, X_test.shape)

	model = RandomForestClassifier(random_state=42,
	                               n_estimators=1200,
	                               min_samples_leaf=3,
	                               min_samples_split=6,
	                               max_features='sqrt',
	                               max_depth=40,
	                               bootstrap=True,
	                               n_jobs=-1)
	model.fit(X_train, y_train)
	y_prob_pred = model.predict_proba(X_test)[:,1]
	print((y_prob_pred >= 0.5).sum(), (y_prob_pred >= 0.6).sum(), (y_prob_pred >= 0.7).sum())

	# assign() returns a new frame, so the caller's df_test is left untouched
	return df_test.assign(p_fake=y_prob_pred)

################################## CLUSTERING
review_features = [
	'tfidf_review_body',
	'n_of_reviews',
	'avg_review_rating',
	'avg_days_between_reviews',
	'stdev_days_between_reviews',
	'max_days_between_reviews',
	'min_days_between_reviews',
	'share_helpful_reviews',
	'share_1star',
	'share_5star',
	'share_photo',
	'std_review_len',
]
network_features = [
	'pagerank',
	'w_degree',
	'clustering_coef',
	'eigenvector_cent',
]

features_to_use = review_features + network_features

# number of clusters
k = 20

# standardise the selected columns of df_ucsd, then run k-means on them
X = scaling_data(df_ucsd, features_to_use)
method = KMeans(n_clusters=k, random_state=42)
method.fit(X)
labels = method.labels_

# KMeans labels start at 0; the stored cluster IDs run from 1 to k
df_ucsd['cluster_ID'] = labels + 1

# number of products per cluster
print(df_ucsd.groupby('cluster_ID')['product_ID'].count())

################################# CLASSIFICATION ON CLUSTERS
frames = []
for i in range(k):

	print("================ CLUSTER {}====================".format(i+1))

	# reviews of the products assigned to this cluster (cluster IDs are 1-based)
	cluster_products = df_ucsd.loc[df_ucsd.cluster_ID == i+1, 'product_ID'].values
	cluster_reviews = reviews.loc[reviews.product_ID.isin(cluster_products), :]

	# network features are computed on this cluster's reviews only
	df_network = obtain_network_features(cluster_reviews)

	# join them with the review features stored in df_ucsd
	network_part = df_network[['product_ID'] + network_features]
	review_part = df_ucsd[review_features+['product_ID']]
	df = network_part.merge(review_part, on='product_ID', how='inner')

	# df_ours is passed as the training frame, the cluster frame is the one scored
	df_with_p_fake = classification_results(df_ours, df, features=features_to_use)

	frames.append(df_with_p_fake)

# stack the per-cluster frames and attach each product's cluster ID
clusters = pd.concat(frames, axis=0, ignore_index=True)
cluster_lookup = df_ucsd[['product_ID', 'cluster_ID']]
clusters = clusters.merge(cluster_lookup, on='product_ID', how='inner')

################################ RESULTS

profile_columns = review_features + network_features

# Per cluster: the mean of every feature, the number of products, and the number
# of products whose p_fake is at least 0.5.
aggregations = {column: 'mean' for column in profile_columns}
aggregations['product_ID'] = 'count'
aggregations['p_fake'] = lambda x: (x >= 0.5).sum()

clusters_pt = clusters.pivot_table(index='cluster_ID', aggfunc=aggregations)

# replace the feature means by their z-scores taken across clusters
clusters_pt[profile_columns] = scipy.stats.zscore(clusters_pt[profile_columns])

# fixed column order for the displayed table
column_order = [
	'clustering_coef',
	'eigenvector_cent',
	'share_photo',
	'w_degree',
	'n_of_reviews',
	'max_days_between_reviews',
	'pagerank',
	'share_5star',
	'tfidf_review_body',
	'avg_days_between_reviews',
	'stdev_days_between_reviews',
	'avg_review_rating',
	'std_review_len',
	'share_1star',
	'share_helpful_reviews',
	'min_days_between_reviews',
	'product_ID',
	'p_fake',
]
clusters_pt = clusters_pt.reindex(column_order, axis=1)
clusters_pt

cluster_ID
1     4439
2     3618
3      613
4     8162
5     1325
6     2061
7     3029
8     3468
9     1535
10     432
11    5249
12    6409
13     922
14    3562
15    3068
16    2479
17    1524
18    2509
19    1948
20    8233
Name: product_ID, dtype: int64
Shape of train and test: (3408, 16) (4439, 16)
0 0 0
Shape of train and test: (3408, 16) (3618, 16)
4 0 0
Shape of train and test: (3408, 16) (613, 16)
0 0 0
Shape of train and test: (3408, 16) (8162, 16)
1 0 0
Shape of train and test: (3408, 16) (1325, 16)
31 6 0
Shape of train and test: (3408, 16) (2061, 16)
1 0 0
Shape of train and test: (3408, 16) (3029, 16)
6 0 0
Shape of train and test: (3408, 16) (3468, 16)
138 13 4
Shape of train and test: (3408, 16) (1535, 16)
0 0 0
Shape of train and test: (3408, 16) (432, 16)
281 126 30
Shape of train and test: (3408, 16) (5249, 16)
32 7 0
Shape of train and test: (3408, 16) (6409, 16)
5 0 0
Shape of train and test: (3408, 16) (922, 16)
8 0 0
Shape of train and test: (3408, 16) (3562,

Unnamed: 0_level_0,clustering_coef,eigenvector_cent,share_photo,w_degree,n_of_reviews,max_days_between_reviews,pagerank,share_5star,tfidf_review_body,avg_days_between_reviews,stdev_days_between_reviews,avg_review_rating,std_review_len,share_1star,share_helpful_reviews,min_days_between_reviews,product_ID,p_fake
cluster_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,-0.449694,-0.938847,0.040628,-0.534468,-0.535591,-0.394908,-0.616731,-1.448677,-1.208245,-0.222322,-0.333507,-1.37654,-0.666436,1.158868,-0.494065,-0.304684,4439,0
2,-0.44609,0.490023,-0.156408,1.836494,1.308333,-0.424359,-0.461089,0.795046,0.054844,-1.061513,-0.678172,0.735312,-0.325867,-0.655024,-0.674659,-0.314785,3618,4
3,-0.304251,-0.886439,-0.312269,-0.636023,-0.386005,3.912262,1.683537,0.257764,-0.259546,2.04297,3.531381,0.235834,-0.287794,-0.21071,-0.076931,0.124099,613,0
4,-0.570373,-0.166414,-0.032322,-0.388206,-0.515417,-0.496516,-0.790426,-0.109667,-1.005601,-0.408221,-0.467693,0.010897,-0.708219,-0.171181,-0.56431,-0.314785,8162,1
5,0.64062,0.866913,-0.309412,-0.12022,-0.125278,-0.33828,0.35504,-0.474479,2.664775,-0.346173,-0.343336,-0.320813,3.493635,0.233983,1.656781,-0.265562,1325,31
6,-0.490083,-0.952834,0.057046,-0.598121,-0.491894,-0.40157,-0.240827,0.249172,-0.020008,-0.123817,-0.324207,0.266231,0.149844,-0.264395,2.255285,-0.28314,2061,1
7,-0.339017,-0.952583,-0.248612,-0.633919,-0.731735,-0.144636,-0.418016,0.348697,-0.850581,1.073774,0.148847,0.362322,-0.524818,-0.375418,-0.185241,4.294439,3029,6
8,-0.328782,0.107737,4.237893,-0.446213,-0.41673,-0.564059,-0.503415,0.613101,-0.476515,-0.61387,-0.565299,0.549834,-0.513597,-0.484736,-0.176373,-0.277172,3468,138
9,-0.421176,-0.965763,-0.112908,-0.598792,-0.460026,-0.20164,-0.026293,-2.784698,-0.56432,0.067954,-0.107757,-3.205405,-0.265808,3.583133,-0.055765,-0.006739,1535,0
10,2.392902,3.370968,-0.330767,1.932213,2.744075,-0.318898,3.274722,0.612676,0.06337,-1.17499,-0.697028,0.639973,-0.354781,-0.648018,-0.896827,-0.314785,432,281
