## Classification of accounts
This notebook uses our preprocessed data to classify which accounts are likely to be bots and which are not. We currently use three models:
1. Random Forest Classifier
2. K-Nearest-Neighbors Classifier
3. Support Vector Classifier

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the preprocessed accounts table and discard every row that has a
# missing value in any column (dropna's default behaviour).
# NOTE(review): this also drops rows whose only gap is in a column the models
# never use (e.g. description, image_url) — confirm that is intended.
df = pd.read_csv('../data/accounts_processed.csv').dropna()
df.dtypes

Unnamed: 0           object
id                   object
name                 object
screen_name          object
description          object
date_created         object
followers            object
following            object
last_updated         object
image_url            object
is_bot               object
category             object
bot_ratio           float64
f2f                 float64
bot_guess           float64
ave_polarity        float64
ave_subjectivity    float64
ave_length          float64
ave_hashtags        float64
ave_mentions        float64
dtype: object

### Split Data

In [3]:
# No user feedback: every numeric feature except 'bot_ratio'.
feats = [
    'f2f', 'bot_guess', 'ave_polarity', 'ave_subjectivity',
    'ave_length', 'ave_hashtags', 'ave_mentions',
]
labs = ['category']
features, labels = df[feats], df[labs]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
)

In [4]:
# With user feedback: same features as before plus 'bot_ratio'.
# Note that feats / features / labels are rebound here; only the *2 split
# variables distinguish this feature set from the previous one.
feats = [
    'bot_ratio', 'f2f', 'bot_guess', 'ave_polarity', 'ave_subjectivity',
    'ave_length', 'ave_hashtags', 'ave_mentions',
]
labs = ['category']
features, labels = df[feats], df[labs]

# Same test fraction and seed as the no-feedback split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    features,
    labels,
    test_size=0.33,
    random_state=42,
)

### Method 1: Random Forest Classifier

In [5]:
# Random forest on the feature set without user feedback.
# y_train is a single-column DataFrame; np.ravel flattens it to the 1-D
# (n_samples,) target scikit-learn expects, which avoids the
# DataConversionWarning raised for column-vector targets.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, np.ravel(y_train))

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [6]:
# Accuracy of the no-feedback random forest on the held-out rows.
pred = clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.3390844260409478

In [7]:
# Random forest on the feature set that includes user feedback ('bot_ratio').
# y_train2 is a single-column DataFrame; np.ravel flattens it to the 1-D
# (n_samples,) target scikit-learn expects, which avoids the
# DataConversionWarning raised for column-vector targets.
clf2 = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf2.fit(X_train2, np.ravel(y_train2))

  


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=2, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [8]:
# Accuracy of the with-feedback random forest on the held-out rows.
pred = clf2.predict(X_test2)
accuracy_score(y_true=y_test2, y_pred=pred)

0.33954451345755693

### Method 2: K-Nearest Neighbors Classifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

# Encode the category labels as integers. The encoder is fitted on the
# training labels only and the same mapping is then applied to the test labels.
# np.ravel flattens the single-column label frames to 1-D, the shape
# LabelEncoder expects; passing the column vector directly triggers a
# DataConversionWarning.
# The encoded arrays are kept in variables so they are not lost after display.
# NOTE(review): the KNN cells that follow are fitted on the raw y_train /
# y_train2 labels, not on these encoded arrays — confirm whether the encoding
# is actually needed.
le = preprocessing.LabelEncoder()
y_train_enc = le.fit_transform(np.ravel(y_train))
y_test_enc = le.transform(np.ravel(y_test))
y_test_enc

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


array([0, 0, 0, ..., 2, 0, 0])

In [10]:
# 3-nearest-neighbours classifier on the feature set without user feedback.
# np.ravel flattens the single-column label frame to the 1-D (n_samples,)
# target scikit-learn expects, avoiding the column-vector
# DataConversionWarning.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, np.ravel(y_train))

  


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [11]:
# Accuracy of the no-feedback KNN model on the held-out rows.
pred = neigh.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.3308795337780845

In [12]:
# 3-nearest-neighbours classifier on the feature set that includes user
# feedback ('bot_ratio').
# np.ravel flattens the single-column label frame to the 1-D (n_samples,)
# target scikit-learn expects, avoiding the column-vector
# DataConversionWarning.
neigh2 = KNeighborsClassifier(n_neighbors=3)
neigh2.fit(X_train2, np.ravel(y_train2))

  


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='uniform')

In [13]:
# Accuracy of the with-feedback KNN model on the held-out rows.
pred = neigh2.predict(X_test2)
accuracy_score(y_true=y_test2, y_pred=pred)

0.3305728088336784

### Method 3: Support Vector Classifier (RBF kernel)

In [14]:
from sklearn.svm import SVC

In [15]:
# Support vector classifier (SVC's default kernel; no kernel argument is
# passed) on the feature set without user feedback.
# np.ravel flattens the single-column label frame to the 1-D (n_samples,)
# target scikit-learn expects, removing the column_or_1d
# DataConversionWarning this cell otherwise emits.
svc = SVC(gamma='auto')
svc.fit(X_train, np.ravel(y_train))

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [16]:
# Accuracy of the no-feedback SVC on the held-out rows.
pred = svc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.33187638984740436

In [17]:
# Support vector classifier (SVC's default kernel; no kernel argument is
# passed) on the feature set that includes user feedback ('bot_ratio').
# np.ravel flattens the single-column label frame to the 1-D (n_samples,)
# target scikit-learn expects, removing the column_or_1d
# DataConversionWarning this cell otherwise emits.
svc2 = SVC(gamma='auto')
svc2.fit(X_train2, np.ravel(y_train2))

  y = column_or_1d(y, warn=True)


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [18]:
# Accuracy of the with-feedback SVC on the held-out rows.
pred = svc2.predict(X_test2)
accuracy_score(y_true=y_test2, y_pred=pred)

0.33011272141706927