In [None]:
#import statements
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import networkx as nx
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
import operator
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.cluster import KMeans
import scipy

In [None]:
#Reading the in the dataset and checking the dataset read in properly
reviews_with_prod_label = pd.read_csv('reviews_with_prod_label.csv')
print(reviews_with_prod_label.head())
print(reviews_with_prod_label.columns)
print(reviews_with_prod_label.shape)

   Unnamed: 0  index  review_rating  number_of_helpful  \
0           0      0            5.0                0.0   
1           1      1            5.0                2.0   
2           2      2            5.0                0.0   
3           3      3            5.0                0.0   
4           4      4            5.0                0.0   

                                         review_body           review_title  \
0  JUST WHAT I THOUGHT IT WAS AND ORDER WENT SMOO...             Five Stars   
1  After I saw the motion picture, Heaven is for ...  Wallet photo of Jesus   
2  Great quality of image and prompty service, gr...             Five Stars   
3     Loved them gave them out to sunday bible study             Five Stars   
4       Arrived in perfect condition. No complaints.             Five Stars   

  review_date  number_of_photos  product_ID  reviewer_ID  fake  
0  2014-12-14                 0           0      2270578     0  
1  2014-12-13                 0           0   

In [None]:
#Converting review date to a datetime format
reviews_with_prod_label['review_date'] = pd.to_datetime(reviews_with_prod_label['review_date'])

#sort dataframe by product id and within product id sort by review date (earliest to latest)
reviews_with_prod_label = reviews_with_prod_label.sort_values(['product_ID', 'review_date'])

#get days between reviews for each review in product group
#first group by product id, diff gets the difference between each row's review date and previous review data within the same product
#then extract the nubmer of days from result using dt.days
reviews_with_prod_label['days_between_reviews'] = (
    reviews_with_prod_label.groupby('product_ID')['review_date']
      .diff()
      .dt.days
)

#fill days between reviews for first review of product with 0 instead of NA
reviews_with_prod_label['days_between_reviews'] = reviews_with_prod_label['days_between_reviews'].fillna(0)

#concatenate review title with review body into one feature
reviews_with_prod_label["text"] = reviews_with_prod_label["review_title"].fillna("") + " " + reviews_with_prod_label["review_body"].fillna("")
#make all text lowercase
reviews_with_prod_label["text"] = reviews_with_prod_label["text"].apply(lambda x: x.lower())

In [None]:
#obtain review features for each product by grouping reviews by product id, then aggregating other columns to get features
#ex. avg_review_rating comes from taking the mean of all of the reviews' ratings for each product
reviews_metadata = reviews_with_prod_label.groupby(["product_ID"]).agg(
    n_of_reviews = ('index', 'count'),
    avg_review_rating = ('review_rating', 'mean'),
    avg_days_between_reviews = ('days_between_reviews', 'mean'),
    stdev_days_between_reviews = ('days_between_reviews', 'std'),
    max_days_between_reviews = ('days_between_reviews', 'max'),
    min_days_between_reviews = ('days_between_reviews', 'min'),
    avg_helpful_reviews = ('number_of_helpful', lambda x: (x > 0).mean()),
    avg_1star_reviews = ('review_rating', lambda x: (x == 1).mean()),
    avg_5star_reviews = ('review_rating', lambda x: (x == 5).mean()),
    avg_photo_reviews = ('number_of_photos', lambda x: (x > 0).mean()),
    std_review_len = ('text', lambda x: x.apply(lambda y: len(y.split())).std())
)

#reset index
reviews_metadata = reviews_metadata.reset_index()
#add back in product id/fake label for each product
reviews_metadata = reviews_metadata.merge(reviews_with_prod_label, on='product_ID', how='left')
#remove duplicates of products, keep one instance of each product
reviews_metadata = reviews_metadata.drop_duplicates(subset=['product_ID'])
#keeping all columns related to metadata of reviews, with product id and label if buy fake reviews or not
reviews_metadata = reviews_metadata[['index', 'product_ID', 'n_of_reviews', 'avg_review_rating', 'avg_days_between_reviews', 'stdev_days_between_reviews', 'max_days_between_reviews', 'min_days_between_reviews', 'avg_helpful_reviews', 'avg_1star_reviews', 'avg_5star_reviews', 'avg_photo_reviews', 'std_review_len', 'fake']]

In [None]:
#define X as products, as will use products to uniquely identify cases for each type of data (review metadata, network features, text features)
X = reviews_metadata[["product_ID"]]
#define y as fake label and corresponding product id so we can always use product id to rejoin to other dataframes
y = reviews_metadata[["fake", "product_ID"]]

#80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 42)

print(X_train.shape)
print(X_test.shape)

(2726, 1)
(682, 1)


In [None]:
#define training review text as text of reviews that belong to products whose product ids are in the training set of product ids
X_train_text = reviews_with_prod_label[reviews_with_prod_label["product_ID"].isin(X_train["product_ID"])]
#repeat same process for test text
X_test_text = reviews_with_prod_label[reviews_with_prod_label["product_ID"].isin(X_test["product_ID"])]

In [None]:
#combine text of all reviews in the same product id into one line of text associated with product id
X_train_text_combined = (
    X_train_text.groupby('product_ID')['text']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

#repeat for test text
X_test_text_combined = (
    X_test_text.groupby('product_ID')['text']
    .apply(lambda x: ' '.join(x))
    .reset_index()
)

In [None]:
#create tfidf object that that takes top 1000
tfidf = TfidfVectorizer(max_features=1000)
#fit transform on the combined training text for each product
X_train_text_tfidf = tfidf.fit_transform(X_train_text_combined["text"])
#transform the combined test text for each product
X_test_text_tfidf = tfidf.transform(X_test_text_combined["text"])
#get feature names (words/numbers that where the top 1000 features)
tfidf_features = tfidf.get_feature_names_out()

#convert from sparse representation to dense representation
X_train_text_tfidf = X_train_text_tfidf.toarray()
X_test_text_tfidf = X_test_text_tfidf.toarray()

#make tfidf representations dataframes with tfidf information as values, and features as column names
X_train_text_tfidf = pd.DataFrame(X_train_text_tfidf, columns = tfidf_features)
X_test_text_tfidf = pd.DataFrame(X_test_text_tfidf, columns = tfidf_features)

In [None]:
#concatenate X_train_text_combined that has product id with training tfidf representation
X_train_text_tfidf_pid = pd.concat([X_train_text_combined.reset_index(drop=True), X_train_text_tfidf.reset_index(drop=True)], axis=1)
#repeat for X_test_text_combined and tfidf of test set
X_test_text_tfidf_pid = pd.concat([X_test_text_combined.reset_index(drop=True), X_test_text_tfidf.reset_index(drop=True)], axis=1)

#drop text column so only have product id and features of tfidf for each product in training set
X_train_tfidf = X_train_text_tfidf_pid.drop(columns=['text'])
#repeat for test set
X_test_tfidf = X_test_text_tfidf_pid.drop(columns=['text'])

In [None]:
#get X_train of reviews metadata by matching product ids in reviews metadata to product ids in X_train
X_train_metadata = reviews_metadata[reviews_metadata["product_ID"].isin(X_train["product_ID"])]
#repeat for X_test of reviews metadata
X_test_metadata = reviews_metadata[reviews_metadata["product_ID"].isin(X_test["product_ID"])]

#subset X_train_metdata to only keep features related to review metadata, except also keep product id
X_train_metadata = X_train_metadata[['product_ID', 'n_of_reviews', 'avg_review_rating', 'avg_days_between_reviews', 'stdev_days_between_reviews', 'max_days_between_reviews', 'min_days_between_reviews', 'avg_helpful_reviews', 'avg_1star_reviews', 'avg_5star_reviews', 'avg_photo_reviews', 'std_review_len']]
#repeat for X_test_metadata
X_test_metadata = X_test_metadata[['product_ID', 'n_of_reviews', 'avg_review_rating', 'avg_days_between_reviews', 'stdev_days_between_reviews', 'max_days_between_reviews', 'min_days_between_reviews', 'avg_helpful_reviews', 'avg_1star_reviews', 'avg_5star_reviews', 'avg_photo_reviews', 'std_review_len']]

In [None]:
def weighted_projected_graph(B, nodes, ratio=False):
  if B.is_directed():
    pred = B.pred
    G = nx.DiGraph()
  else:
    pred = B.adj
    G = nx.Graph()
  G.graph.update(B.graph)
  G.add_nodes_from((n, B.nodes[n]) for n in nodes)
  n_top = float(len(B) - len(nodes))
  nodes_checked = []
  for u in nodes:
    nodes_checked.append(u)
    unbrs = set(B[u])
    nbrs2 = {n for nbr in unbrs for n in B[nbr]} - set(nodes_checked)
    for v in nbrs2:
      vnbrs = set(pred[v])
      common = unbrs & vnbrs
      if not ratio:
        weight = len(common)
      else:
        weight = len(common) / n_top
      G.add_edge(u, v, weight=weight)
  return G

def obtain_network_features(reviews):
  # initializing the product-level data
  df = pd.DataFrame({"product_ID": reviews.product_ID.unique()})

  # building the bipartite product-reviewer graph
  B = nx.Graph()
  B.add_nodes_from(reviews.reviewer_ID, bipartite=0)
  B.add_nodes_from(reviews.product_ID, bipartite=1)
  B.add_edges_from([(row['reviewer_ID'], row['product_ID']) for idx, row in reviews.iterrows()])

  # building the product projected graph
  P = weighted_projected_graph(B, reviews.product_ID.unique())

  w_degree = nx.degree(P, weight='weight')
  cc = nx.clustering(P)
  pr = nx.pagerank(P, alpha=0.85)
  eig_cent = nx.eigenvector_centrality(P, max_iter=500)

  # creating the features data
  df['w_degree'] = [w_degree[i] for i in df.product_ID]
  df['clustering_coef'] = [cc[i] for i in df.product_ID]
  df['pagerank'] = [pr[i] for i in df.product_ID]
  df['eigenvector_cent'] = [eig_cent[i] for i in df.product_ID]
  return df

#define network features for training set of products and reviwers in training set of reviews
X_train_network = obtain_network_features(X_train_text)
#repeat for test set
X_test_network = obtain_network_features(X_test_text)

In [None]:
#drop duplicates of network featuers so only have one instance network features for each product in the training set
X_train_with_network = X_train_network.drop_duplicates(subset=['product_ID'])
#do the same for the set of products in the testing set
X_test_with_network = X_test_network.drop_duplicates(subset=['product_ID'])

#subset X_train_with_network to include only network features and product id
X_train_with_network = X_train_with_network[['product_ID', 'w_degree', 'clustering_coef', 'pagerank', 'eigenvector_cent']]
#repeat for text set
X_test_with_network = X_test_with_network[['product_ID', 'w_degree', 'clustering_coef', 'pagerank', 'eigenvector_cent']]

In [None]:
#defining network, reviews, and text features
network_features = ['pagerank', 'w_degree', 'clustering_coef', 'eigenvector_cent']

review_features = ['n_of_reviews','avg_review_rating',
                   'avg_days_between_reviews', 'stdev_days_between_reviews',
                   'max_days_between_reviews', 'min_days_between_reviews',
                   'avg_helpful_reviews', 'avg_1star_reviews', 'avg_5star_reviews', 'avg_photo_reviews', 'std_review_len']

text_features = tfidf_features.tolist()

In [None]:
#merge all features subsets together for training set based on product id
X_train_all_features = pd.merge(pd.merge(X_train_metadata, X_train_with_network, on='product_ID', how='inner'), X_train_tfidf, on='product_ID', how='inner')
#repeat for testing set
X_test_all_features = pd.merge(pd.merge(X_test_metadata, X_test_with_network, on='product_ID', how='inner'), X_test_tfidf, on='product_ID', how='inner')

#create standard scaled object
scaler = StandardScaler()
#fit and transform all features for training set, excluding product id
X_train_all_features_scaled = scaler.fit_transform(X_train_all_features[network_features + review_features + text_features])
#transform all features for test set, excluding product id
X_test_all_features_scaled = scaler.transform(X_test_all_features[network_features + review_features + text_features])

#transform scaled array to dataframe with sacled values and features names as column names for training set
X_train_all_features_scaled = pd.DataFrame(X_train_all_features_scaled, columns = network_features + review_features + text_features)
#repeat for test set
X_test_all_features_scaled = pd.DataFrame(X_test_all_features_scaled, columns = network_features + review_features + text_features)

#add back in product id to training set (unscaled)
X_train_all_features_scaled["product_ID"] = X_train_all_features["product_ID"]
#repeat for test set
X_test_all_features_scaled["product_ID"] = X_test_all_features["product_ID"]

In [None]:
#get labels from y_train, as y_train has labels and product id
y_train_labels = y_train[["fake"]]
#repeat for y_test
y_test_labels = y_test[["fake"]]

#flatten dimensionality of y_train_labels so just array of values (prep for ml model inputs)
y_train_labels = y_train_labels.values.ravel()
#repeat for y_test_labels
y_test_labels = y_test_labels.values.ravel()

##Supervised

In [None]:
def model_building(X_train, y_train, X_test, y_test, model):
	#general function to evaluate model performance on model and data passed in

	#fit model
	model.fit(X_train, y_train)

	#obtain label predictions on test set
	y_pred = model.predict(X_test)
	#get confusion matrix
	cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
	#get probability predictions using model on test set
	probs = model.predict_proba(X_test)[:,1]
	#get accuracy, precision, recall, f1
	accuracy_score = metrics.accuracy_score(y_test, y_pred)
	print("Accuracy", accuracy_score)
	precision = metrics.precision_score(y_test, y_pred, zero_division = 0)
	print("Precision", precision)
	recall = metrics.recall_score(y_test, y_pred, zero_division = 0)
	print("Recall", recall)
	f1 = metrics.f1_score(y_test, y_pred, zero_division = 0)
	print("F1 Score", f1)
	print("\n")

	#print out metrics
	print("AUC, Accuracy, TN, TP, F1 Score")
	print("{}, {}, {}, {}, {}".format(metrics.roc_auc_score(y_test, model.predict_proba(X_test)[:,1]),
															  sum(cm.diagonal()) / X_test.shape[0],
															  cm[0,0] / sum(cm[0,:]),
															  cm[1,1] / sum(cm[1,:]),
															  metrics.f1_score(y_test, y_pred, average='weighted')))
	#return predicted probabilties on test set using model passed in
	return probs

In [None]:
def classification_results(X_train, X_test, features=None):
  #subset training and test set to features want to use for this function call
  X_train = X_train[features]
  X_test = X_test[features]

  #build loigistic regression model and evaluate on test set
  print("="*10 + "Logistic Regression" + "="*10)
  model = LogisticRegression(max_iter=400)
  model_building(X_train, y_train_labels, X_test, y_test_labels, model)

  #build random forest classifier and evaluate on test set
  print("="*10 + "Random Forest" + "="*10)
  model = RandomForestClassifier(random_state=42,
                                 n_estimators=100,
	                               min_samples_leaf=3,
	                               min_samples_split=6,
	                               max_features='sqrt',
	                               max_depth=40,
	                               bootstrap=True,
	                               n_jobs=-1)
  model_building(X_train, y_train_labels, X_test, y_test_labels, model)

  #get feature importance from random forest classifier
  print("="*10 + "RF Feature Importance" + "="*10)
  imps = model.feature_importances_
  feat_imp = {features[i]: imps[i] for i in range(len(features))}

  #if more than 100 features, only output top 50 features and their importance
  if len(features) > 100:
    print(sorted(feat_imp.items(), key=operator.itemgetter(1), reverse=True)[:50])
  #if elss than 100 features, outupt all features and their importance
  else:
    print(sorted(feat_imp.items(), key=operator.itemgetter(1), reverse=True))

  #build SVC linear classifier and evaluate on test set
  print("="*10 + "SVC Linear" + "="*10)
  model = SVC(kernel='linear', probability=True)
  model_building(X_train, y_train_labels, X_test, y_test_labels, model)

  #build xg boost classifier and evaluate on test set
  print("="*10 + "XGBoost" + "="*10)
  model = xgb.XGBClassifier()
  model_building(X_train, y_train_labels, X_test, y_test_labels, model)

  return

In [None]:
#call set of models on scaled review features
classification_results(X_train_all_features_scaled, X_test_all_features_scaled, features=review_features)

Accuracy 0.592375366568915
Precision 0.28
Recall 0.026217228464419477
F1 Score 0.04794520547945205


AUC, Accuracy, TN, TP, F1 Score
0.48347998736519116, 0.592375366568915, 0.9566265060240964, 0.026217228464419477, 0.46947228915879363
Accuracy 0.5557184750733137
Precision 0.4117647058823529
Recall 0.3146067415730337
F1 Score 0.35668789808917195


AUC, Accuracy, TN, TP, F1 Score
0.5078200442218312, 0.5557184750733137, 0.7108433734939759, 0.3146067415730337, 0.5416771241774561
[('std_review_len', np.float64(0.11478019642224352)), ('avg_review_rating', np.float64(0.10403696987092186)), ('avg_1star_reviews', np.float64(0.10380726195763872)), ('avg_5star_reviews', np.float64(0.10308085108255806)), ('stdev_days_between_reviews', np.float64(0.10307972614375453)), ('avg_days_between_reviews', np.float64(0.10131923580170062)), ('max_days_between_reviews', np.float64(0.10127734261813234)), ('avg_helpful_reviews', np.float64(0.09998912593717278)), ('n_of_reviews', np.float64(0.09794708631655567))

In [None]:
#call set of models on scaled network features
classification_results(X_train_all_features_scaled, X_test_all_features_scaled, features=network_features)

Accuracy 0.6085043988269795
Precision 0.0
Recall 0.0
F1 Score 0.0


AUC, Accuracy, TN, TP, F1 Score
0.4660845629709851, 0.6085043988269795, 1.0, 0.0, 0.46039986419908197
Accuracy 0.5850439882697948
Precision 0.4375
Recall 0.20973782771535582
F1 Score 0.28354430379746837


AUC, Accuracy, TN, TP, F1 Score
0.5042191236857543, 0.5850439882697948, 0.8265060240963855, 0.20973782771535582, 0.5417948075250545
[('eigenvector_cent', np.float64(0.27859043760824637)), ('pagerank', np.float64(0.2744448176320625)), ('clustering_coef', np.float64(0.26668286171537386)), ('w_degree', np.float64(0.18028188304431725))]
Accuracy 0.6085043988269795
Precision 0.0
Recall 0.0
F1 Score 0.0


AUC, Accuracy, TN, TP, F1 Score
0.5198005505166734, 0.6085043988269795, 1.0, 0.0, 0.46039986419908197
Accuracy 0.5953079178885631
Precision 0.4482758620689655
Recall 0.14606741573033707
F1 Score 0.22033898305084745


AUC, Accuracy, TN, TP, F1 Score
0.5293894679842968, 0.5953079178885631, 0.8843373493975903, 0.1460674157303

In [None]:
#call set of models on what paper thinks is top two network features
classification_results(X_train_all_features_scaled, X_test_all_features_scaled, features=['eigenvector_cent', 'clustering_coef'])

Accuracy 0.6055718475073314
Precision 0.45454545454545453
Recall 0.03745318352059925
F1 Score 0.06920415224913495


AUC, Accuracy, TN, TP, F1 Score
0.5077794323360858, 0.6055718475073314, 0.9710843373493976, 0.03745318352059925, 0.48332990765778894
Accuracy 0.5293255131964809
Precision 0.3761467889908257
Recall 0.30711610486891383
F1 Score 0.33814432989690724


AUC, Accuracy, TN, TP, F1 Score
0.49036595821488205, 0.5293255131964809, 0.672289156626506, 0.30711610486891383, 0.5186680865961634
[('eigenvector_cent', np.float64(0.5142366728943509)), ('clustering_coef', np.float64(0.48576332710564907))]
Accuracy 0.6085043988269795
Precision 0.0
Recall 0.0
F1 Score 0.0


AUC, Accuracy, TN, TP, F1 Score
0.4924552141148866, 0.6085043988269795, 1.0, 0.0, 0.46039986419908197
Accuracy 0.5058651026392962
Precision 0.36328125
Recall 0.34831460674157305
F1 Score 0.35564053537284895


AUC, Accuracy, TN, TP, F1 Score
0.4957808763142457, 0.5058651026392962, 0.6072289156626506, 0.34831460674157305, 0.503

In [None]:
#call set of models on review and network features
classification_results(X_train_all_features_scaled, X_test_all_features_scaled, features=review_features + network_features)

Accuracy 0.5997067448680352
Precision 0.3125
Recall 0.018726591760299626
F1 Score 0.0353356890459364


AUC, Accuracy, TN, TP, F1 Score
0.4720545101755336, 0.5997067448680352, 0.9734939759036144, 0.018726591760299626, 0.46866406678168293
Accuracy 0.5557184750733137
Precision 0.4072164948453608
Recall 0.2958801498127341
F1 Score 0.34273318872017355


AUC, Accuracy, TN, TP, F1 Score
0.48469834393754796, 0.5557184750733137, 0.7228915662650602, 0.2958801498127341, 0.5385003954456513
[('std_review_len', np.float64(0.08060696261899274)), ('clustering_coef', np.float64(0.07742946411016653)), ('avg_days_between_reviews', np.float64(0.07727446767344669)), ('avg_5star_reviews', np.float64(0.07490643917021049)), ('avg_1star_reviews', np.float64(0.07446406747578954)), ('max_days_between_reviews', np.float64(0.07338658646051058)), ('avg_review_rating', np.float64(0.0730099668948618)), ('eigenvector_cent', np.float64(0.07260744282496082)), ('stdev_days_between_reviews', np.float64(0.07228797882605965

In [None]:
#call set of models on text features
classification_results(X_train_all_features_scaled, X_test_all_features_scaled, features = text_features)

Accuracy 0.49266862170087977
Precision 0.3696369636963696
Recall 0.41947565543071164
F1 Score 0.3929824561403509


AUC, Accuracy, TN, TP, F1 Score
0.5061730066332747, 0.49266862170087977, 0.5397590361445783, 0.41947565543071164, 0.4971883974693672
Accuracy 0.5733137829912024
Precision 0.3723404255319149
Recall 0.13108614232209737
F1 Score 0.19390581717451524


AUC, Accuracy, TN, TP, F1 Score
0.5120572176345833, 0.5733137829912024, 0.8578313253012049, 0.13108614232209737, 0.5078725286678855
[('came', np.float64(0.0026631307067331716)), ('off', np.float64(0.0024532731472602567)), ('expected', np.float64(0.0024078506441855933)), ('did', np.float64(0.0023927343601812157)), ('our', np.float64(0.002312392853852551)), ('after', np.float64(0.002295352495242248)), ('through', np.float64(0.002243966766814618)), ('from', np.float64(0.0022190445175629477)), ('is', np.float64(0.0021515478202703536)), ('again', np.float64(0.002146503444969373)), ('two', np.float64(0.0020598439375503503)), ('however'

# Clustering - Into Supervised Learning

In [None]:
#set k for number of clusters want
k_clusters = 5
#define k means clustering object
kmeans = KMeans(n_clusters=k_clusters, random_state=42)
#fit k means object to X_train features
kmeans.fit(X_train_all_features_scaled)

#get training labels from k means
train_labels = kmeans.labels_
#predict labels for X_test using k means and store labels
test_labels = kmeans.predict(X_test_all_features_scaled)

#add cluster labels to train/test set as column
X_train_all_features_scaled['cluster_ID'] = train_labels + 1
X_test_all_features_scaled['cluster_ID'] = test_labels + 1
#sort test set in ascending order based on cluster id
X_test_all_features_scaled = X_test_all_features_scaled.sort_values('cluster_ID')

print("Number of products in each cluster:", X_train_all_features_scaled.groupby('cluster_ID')['product_ID'].count())
print("\n")

#define lists to store metrics for each cluster to average at the end
accuracy_score_lst = []
precision_lst = []
recall_lst = []
f1_lst = []
auc_lst = []
pred_probs = []

#iterate through each cluster
for i in range(k_clusters):
  #print which cluster working on
  print("Cluster", (i + 1))
  #get training data and testing data that belong to this cluster based on k means labels
  X_train_cluster = X_train_all_features_scaled[train_labels == i]
  X_test_cluster = X_test_all_features_scaled[test_labels == i]

  #get corresponding y_train and y_test labels based on matching X_training/X_test product ids to product ids of y_train/y_test
  y_train_cluster = y_train[y_train['product_ID'].isin(X_train_cluster['product_ID'])]
  y_test_cluster = y_test[y_test['product_ID'].isin(X_test_cluster['product_ID'])]

  #subset y_train for this cluster to only have labels (fake)
  y_train_cluster = y_train_cluster[['fake']]
  #repeat for y_test for this cluster
  y_test_cluster = y_test_cluster[['fake']]

  #flatten dimensionality of y_train so just array of values of labels (prep for ml model inputs)
  y_train_cluster = y_train_cluster.values.ravel()
  #repeat for y_test
  y_test_cluster = y_test_cluster.values.ravel()

  #create random forest classifier object
  model = RandomForestClassifier(random_state=42,
	                               n_estimators=1200,
	                               min_samples_leaf=3,
	                               min_samples_split=6,
	                               max_features='sqrt',
	                               max_depth=40,
	                               bootstrap=True,
	                               n_jobs=-1)

  #fit model to X_train and y_train for this cluster
  model.fit(X_train_cluster, y_train_cluster)
  #predict labels based on on X_test data for this cluster
  y_pred = model.predict(X_test_cluster)
  #predict probabiltiy that product buy fake reviews
  y_pred_probs = model.predict_proba(X_test_cluster)[:,1]
  #append predicted probabilties to list, so have all predicted probabilties across entire test set at end of loop
  pred_probs.append(y_pred_probs)
  #number of products identified as buying fake reviews with different thresholds
  print("The number of products identified as products that buy fake reviews with a threshold of 0.5:", sum(y_pred_probs >= 0.5))
  print("The number of products identified as products that buy fake reviews with a threshold of 0.6:", sum(y_pred_probs >= 0.6))
  print("The number of products identified as products that buy fake reviews with a threshold of 0.7:", sum(y_pred_probs >= 0.7))


  #get accuracy for this cluster, append accuracy to accuracy list that will contain accuracy values for each cluster at the end of the loop
  accuracy_score = metrics.accuracy_score(y_test_cluster, y_pred)
  #append accuracy to accuracy list that will contain accuracy values for each cluster at the end of the loop
  accuracy_score_lst.append(accuracy_score)
  #print out accuracy for this cluster
  print("Accuracy", accuracy_score)
  #repeat steps for precision
  precision = metrics.precision_score(y_test_cluster, y_pred, zero_division = 0)
  precision_lst.append(precision)
  print("Precision", precision)
  #repeat steps for recall
  recall = metrics.recall_score(y_test_cluster, y_pred, zero_division = 0)
  recall_lst.append(recall)
  print("Recall", recall)
  #repeat steps for f1
  f1 = metrics.f1_score(y_test_cluster, y_pred, zero_division = 0)
  f1_lst.append(f1)
  print("F1 Score", f1)
  #repeat steps for AUC
  auc = metrics.roc_auc_score(y_test_cluster, model.predict_proba(X_test_cluster)[:,1])
  auc_lst.append(auc)
  print("AUC", auc)
  print("***********")
  print("\n")


#print average accuracy across clusters by averaging values in accuracy_score_lst
#repeat for all metrics
print("\n")
print("Average Across all Clusters")
average_accuracy = sum(accuracy_score_lst) / len(accuracy_score_lst)
print("Accuracy", average_accuracy)
average_precision = sum(precision_lst) / len(precision_lst)
print("Precision", average_precision)
average_recall = sum(recall_lst) / len(recall_lst)
print("Recall", average_recall)
average_f1 = sum(f1_lst) / len(f1_lst)
print("F1 Score", average_f1)
average_auc = sum(auc_lst) / len(auc_lst)
print("AUC", average_auc)

Number of products in each cluster: cluster_ID
1    537
2    567
3    518
4    563
5    541
Name: product_ID, dtype: int64


Cluster 1
The number of products identified as products that buy fake reviews with a threshold of 0.5: 13
The number of products identified as products that buy fake reviews with a threshold of 0.6: 0
The number of products identified as products that buy fake reviews with a threshold of 0.7: 0
Accuracy 0.8297872340425532
Precision 0.07692307692307693
Recall 0.07692307692307693
F1 Score 0.07692307692307693
AUC 0.49459134615384615
***********


Cluster 2
The number of products identified as products that buy fake reviews with a threshold of 0.5: 122
The number of products identified as products that buy fake reviews with a threshold of 0.6: 122
The number of products identified as products that buy fake reviews with a threshold of 0.7: 121
Accuracy 0.22950819672131148
Precision 0.22950819672131148
Recall 1.0
F1 Score 0.37333333333333335
AUC 0.5113981762917933
****

In [None]:
#flatten arrays into one list
pred_probs = np.concatenate(pred_probs).tolist()
#add list of predictions to X_test as column
X_test_all_features_scaled['p_fake'] = pred_probs
#create pivot table of mean values for each feature across cluster ids
#p_fake is the number of products identified as products that buy fake reviews based on a thershold of 0.5 in each cluster
clusters_pt = X_test_all_features_scaled.pivot_table(index='cluster_ID', aggfunc={
                                'clustering_coef': 'mean',
																'eigenvector_cent': 'mean',
																'avg_photo_reviews': 'mean',
																'w_degree': 'mean',
																'n_of_reviews': 'mean',
																'max_days_between_reviews':'mean',
																'pagerank':'mean',
																'avg_5star_reviews':'mean',
                                'avg_days_between_reviews':'mean',
                                'stdev_days_between_reviews':'mean',
                                'avg_review_rating':'mean',
																'std_review_len':'mean',
                                'avg_1star_reviews':'mean',
                                'avg_helpful_reviews':'mean',
																'min_days_between_reviews':'mean',
                                'product_ID':'count',
                                'p_fake':lambda x:(x>=0.5).sum(),}
)

#standardize values in pivot table by z score
clusters_pt[review_features + network_features] = scipy.stats.zscore(clusters_pt[review_features + network_features])
#reorder features based on parent paper importance scores in the random forest classifier
clusters_pt = clusters_pt.reindex(['clustering_coef','eigenvector_cent',
									'avg_photo_reviews','w_degree','n_of_reviews','max_days_between_reviews',
									'pagerank','avg_5star_reviews','avg_days_between_reviews',
									'stdev_days_between_reviews','avg_review_rating','std_review_len',
									'avg_1star_reviews','avg_helpful_reviews','min_days_between_reviews','product_ID','p_fake'], axis=1)
clusters_pt

Unnamed: 0_level_0,clustering_coef,eigenvector_cent,avg_photo_reviews,w_degree,n_of_reviews,max_days_between_reviews,pagerank,avg_5star_reviews,avg_days_between_reviews,stdev_days_between_reviews,avg_review_rating,std_review_len,avg_1star_reviews,avg_helpful_reviews,min_days_between_reviews,product_ID,p_fake
cluster_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,-0.467836,0.30123,1.76153,0.156124,1.007482,-0.713532,-0.305503,0.446822,-0.953102,-0.894197,0.407514,-1.052715,-0.152997,-1.033721,,141,13
2,-1.490958,-1.222219,0.104735,-0.672853,-1.104972,-0.954056,0.562551,-1.162054,-0.369221,-0.722083,-0.585912,-0.878354,-0.337226,-0.539321,,122,122
3,-0.207294,1.346127,-1.312719,1.321303,0.537932,1.884891,0.199406,0.371724,1.920086,1.880321,-0.328384,1.658077,1.089696,1.894304,,162,0
4,1.373936,-1.082342,-0.346076,-1.509848,-1.310554,-0.062691,-1.719245,-1.10355,-0.086888,0.10094,-1.205099,0.545031,1.022335,-0.177391,,121,117
5,0.792151,0.657204,-0.207469,0.705274,0.870112,-0.154612,1.262791,1.447059,-0.510875,-0.364981,1.711882,-0.27204,-1.621809,-0.143871,,136,0
