In [1]:
import numpy as np
import pandas as pd 
import matplotlib as plt

In [2]:
# Load the fake/authentic user dataset into a DataFrame.
# The path is relative, so the CSV has to sit in the notebook's working directory.
file = "user_fake_authentic_2class.csv"
df = pd.read_csv(file)
# Credit to Kristo Radion Purba! Check out his paper:
# K. R. Purba, D. Asirvatham and R. K. Murugesan,
# "Classification of instagram fake users using supervised machine learning algorithms,"
# International Journal of Electrical and Computer Engineering (IJECE),
# vol. 10, no. 3, pp. 2763-2772, 2020.

In [3]:
# peek at the last five rows; note that `class` still holds string labels here
df.tail()

Unnamed: 0,pos,flw,flg,bl,pic,lin,cl,cz,ni,erl,erc,lt,hc,pr,fo,cs,pi,class
65321,13,145,642,0,1,0,7,0.461538,0.0,14.27,0.58,0.0,0.077,0.0,0.0,0.192308,1745.29126,r
65322,652,3000,1300,146,1,1,384,0.0,0.389,8.52,0.13,0.0,1.611,0.0,0.0,0.169917,54.62912,r
65323,1500,3700,3200,147,1,1,129,0.0,0.111,9.39,0.31,0.722,0.0,0.0,0.056,0.058908,129.802048,r
65324,329,1500,1800,218,1,1,290,0.055556,0.0,6.35,0.26,0.222,0.5,0.0,0.0,0.103174,53.40284,r
65325,206,659,608,27,1,0,77,0.0,0.333,25.549999,0.53,0.222,0.222,0.0,0.167,0.017505,604.981445,r


In [4]:
# (number of rows, number of columns) of the loaded table
df.shape

(65326, 18)

In [5]:
# total number of missing cells over the whole frame (0 means nothing to impute)
df.isna().to_numpy().sum()

0

In [6]:
# Columns used as model inputs, in the order they will appear in X.
# Of the columns present in the CSV, cz, pr, fo and cs are not used as features.
feature_cols = [
    "pos", "flg", "flw", "bl",
    "pic", "lin", "cl", "ni",
    "erl", "erc", "lt", "hc",
    "pi",
]
# Column holding the target the models are trained to predict.
label_col = "class"

In [7]:
# split the frame into the feature matrix X and the target vector y
X = df.loc[:, feature_cols]
y = df.loc[:, label_col]
# quick look at the first three feature rows
X.head(3)

Unnamed: 0,pos,flg,flw,bl,pic,lin,cl,ni,erl,erc,lt,hc,pi
0,44,325,48,33,1,0,12,0.0,0.0,0.0,0.0,0.0,0.094985
1,10,321,66,150,1,0,213,1.0,14.39,1.97,0.0,1.5,230.412857
2,33,308,970,101,1,1,436,1.0,10.1,0.3,0.0,2.5,43.569939


In [8]:
# turning labels into zeros and ones
def transform_label(label):
    """Map a raw class label to its binary encoding.

    "f" becomes 1 and "r" becomes 0 (presumably "fake" and "real"/authentic,
    judging by the dataset file name -- confirm against the data source).
    Values that are already 0 or 1 are returned unchanged, so applying the
    function to an already-encoded column is a no-op.

    Args:
        label: a single raw label value.

    Returns:
        1 for "f", 0 for "r", or ``label`` itself when it already equals 0 or 1.

    Raises:
        ValueError: if ``label`` is none of 0, 1, "f" or "r". Such values
            used to be mapped to "", which silently produced a mixed-type
            label column instead of surfacing the bad input.
    """
    # pass-through keeps the function idempotent on encoded labels
    if label == 0 or label == 1:
        return label
    if label == "f":
        return 1
    if label == "r":
        return 0
    raise ValueError(f"unrecognised class label: {label!r}")
# encode every label; re-running this cell is harmless because
# transform_label returns values that are already 0 or 1 unchanged
y = y.map(transform_label)
y.tail(3)

65323    0
65324    0
65325    0
Name: class, dtype: int64

In [9]:
# hold out a validation set: 75% of the rows for training, 25% for validation
# (0.25 is train_test_split's default, spelled out here); seeded for repeatability
from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(
    X, y, test_size=0.25, random_state=1
)

In [10]:
# defining function to calculate error rate
def error_rate(predictions, val_y):
    """Return the fraction of predictions that differ from the true labels.

    Both arguments may be any array-likes of the same shape (list, NumPy
    array, pandas Series). They are compared element by element, by position.

    Args:
        predictions: predicted labels.
        val_y: true labels, same shape as ``predictions``.

    Returns:
        float: number of mismatching positions divided by the number of labels.

    Raises:
        ValueError: if the two inputs do not have the same shape.
        ZeroDivisionError: if the inputs are empty.
    """
    predicted = np.asarray(predictions)
    actual = np.asarray(val_y)
    # Refuse mismatched shapes instead of letting NumPy broadcast them
    # (e.g. (n,) against (n, 1) would silently compare every pair).
    if predicted.shape != actual.shape:
        raise ValueError(
            f"shape mismatch: predictions {predicted.shape} vs labels {actual.shape}"
        )
    # Vectorised count; the builtin sum() iterated in Python and could not
    # handle plain lists, whose `!=` yields a single bool.
    err_count = int(np.count_nonzero(predicted != actual))
    return err_count / actual.size

In [11]:
# Random forest: fit on the training split, then predict both splits
from sklearn.ensemble import RandomForestClassifier

# hyper-parameters: number of trees, maximum tree depth, random seed
n, depth, r = 100, 10, 1

# fit() returns the estimator itself, so construction and fitting can be chained
forest_model = RandomForestClassifier(
    n_estimators=n, max_depth=depth, random_state=r
).fit(train_X, train_y)

# training-set predictions are kept too, so train and validation error can be compared
forest_train_predictions = forest_model.predict(train_X)
forest_predictions = forest_model.predict(val_X)

In [12]:
# misclassification rate of the random forest on held-out vs. training data
print("Validation Error is", error_rate(forest_predictions, val_y))
print("Train Error is", error_rate(forest_train_predictions, train_y))

Validation Error is 0.10923340680871908
Train Error is 0.0979711801445075


In [13]:
# Logistic regression with scikit-learn's default settings; only the seed is set
# NOTE(review): the features are passed in unscaled -- confirm the solver converges
from sklearn.linear_model import LogisticRegression

r = 1

# fit() returns the estimator itself, so construction and fitting can be chained
logis_model = LogisticRegression(random_state=r).fit(train_X, train_y)

logis_train_predictions = logis_model.predict(train_X)
logis_predictions = logis_model.predict(val_X)

In [14]:
# misclassification rate of the logistic regression on held-out vs. training data
print("Validation Error is", error_rate(logis_predictions, val_y))
print("Train Error is", error_rate(logis_train_predictions, train_y))

Validation Error is 0.21148665197158953
Train Error is 0.20965832550924604


In [15]:
# K-nearest-neighbours classifier: fit on the training split, predict both splits
from sklearn.neighbors import KNeighborsClassifier

# number of neighbours consulted for each prediction
k = 100

kn_model = KNeighborsClassifier(n_neighbors=k)
kn_model.fit(train_X, train_y)

kn_predictions, kn_train_predictions = (
    kn_model.predict(val_X),
    kn_model.predict(train_X),
)

In [16]:
# misclassification rate of the k-nearest-neighbours model on held-out vs. training data
print("Validation Error is", error_rate(kn_predictions, val_y))
print("Train Error is", error_rate(kn_train_predictions, train_y))

Validation Error is 0.24093803575802106
Train Error is 0.23470220843368575


In [17]:
# AdaBoost classifier: fit on the training split, then predict both splits
from sklearn.ensemble import AdaBoostClassifier

# hyper-parameters: number of boosting rounds, learning rate, random seed
n, lr, r = 75, 1, 1

abc_model = AdaBoostClassifier(n_estimators=n, learning_rate=lr, random_state=r)
abc_model.fit(train_X, train_y)

abc_train_predictions = abc_model.predict(train_X)
abc_predictions = abc_model.predict(val_X)

In [18]:
# misclassification rate of the AdaBoost model on held-out vs. training data
print("Validation Error is", error_rate(abc_predictions, val_y))
print("Train Error is", error_rate(abc_train_predictions, train_y))

Validation Error is 0.13384766103355375
Train Error is 0.13599624443809447


In [19]:
# Linear regression used as a classifier: regress on the 0/1 labels, then
# turn each predicted value into a label by comparing it with a threshold
from sklearn.linear_model import LinearRegression

# predicted values at or above this cut-off count as the positive label
thresh = 0.5

# fit() returns the estimator itself, so construction and fitting can be chained
lr_model = LinearRegression().fit(train_X, train_y)

# the comparisons produce boolean arrays; True/False compare equal to 1/0
# when they are checked against the integer labels
lr_train_predictions = np.greater_equal(lr_model.predict(train_X), thresh)
lr_predictions = np.greater_equal(lr_model.predict(val_X), thresh)

In [20]:
# misclassification rate of the thresholded linear regression on held-out vs. training data
print("Validation Error is", error_rate(lr_predictions, val_y))
print("Train Error is", error_rate(lr_train_predictions, train_y))

Validation Error is 0.21130296350722508
Train Error is 0.21265869290117156


In [21]:
# Most important feature according to the random forest.
# The model was fit on columns ordered as in feature_cols, so the position of
# the largest importance can be used directly as an index into that list.
n = forest_model.feature_importances_.shape[0]
indices = list(range(n))
# argmax returns the first position of the maximum, like max() over the indices
max_i = int(np.argmax(forest_model.feature_importances_))
feature_cols[max_i]

'lin'

In [22]:
# Least important feature according to the random forest.
# The model was fit on columns ordered as in feature_cols, so the position of
# the smallest importance can be used directly as an index into that list.
n = forest_model.feature_importances_.shape[0]
indices = list(range(n))
# argmin returns the first position of the minimum, like min() over the indices
min_i = int(np.argmin(forest_model.feature_importances_))
feature_cols[min_i]

'pic'