In [15]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from scipy.stats import entropy
from sklearn import metrics
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# cross_val_score lives in sklearn.model_selection; the legacy
# sklearn.cross_validation module was removed in scikit-learn 0.20 and is
# only kept here as a fallback for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

%matplotlib inline



In [16]:
# Column labels for merged.csv, in file order (the CSV is read without a header row).
names = [
    'id',
    'id_str',
    'screen_name',
    'location',
    'description',
    'url',
    'followers_count',
    'friends_count',
    'listedcount',
    'created_at',
    'favourites_count',
    'verified',
    'statuses_count',
    'lang',
    'status',
    'default_profile',
    'default_profile_image',
    'has_extended_profile',
    'name',
    'bot',
]

In [17]:
# Load merged.csv. The file has no header row, so the column labels come from
# `names`; a cell containing only '?' is read as missing (NaN).
# `delim_whitespace=False` is no longer passed: False is the default, and the
# keyword itself is deprecated in recent pandas releases.
df = pd.read_csv(
    'merged.csv',
    encoding="ISO-8859-1",
    header=None,
    names=names,
    na_values='?',
)

In [18]:
# Flag columns: coerce to bool first, then store the result as 0/1 integers.
for flag_col in ('verified', 'default_profile', 'default_profile_image'):
    df[flag_col] = df[flag_col].astype('bool').astype(int)

# Count columns and the 'bot' label are cast to plain integers.
for int_col in ('followers_count', 'friends_count', 'listedcount',
                'favourites_count', 'statuses_count', 'bot'):
    df[int_col] = df[int_col].astype(int)

In [19]:
# Derive extra features from the screen name and description columns
screen_names = df["screen_name"]
df["screen_name_len"] = list(map(len, screen_names))
df["bot_is_substr"] = [1 if 'bot' in handle.lower() else 0 for handle in screen_names]
df["bot_in_des"] = [1 if 'bot' in str(text).lower() else 0 for text in df['description']]


def _two_digit_year(created_at):
    """Return the year found in a created_at string, modulo 2000.

    Strings longer than 20 characters are split on whitespace and the first
    four characters of the sixth token are used; shorter strings are split on
    '/' and the first whitespace-separated token of the third field is used.
    """
    if len(created_at) > 20:
        year_token = created_at.split()[5][:4]
    else:
        year_token = created_at.split('/')[2].split()[0]
    return int(year_token) % 2000


# Account age in years from created_at.
# NOTE(review): 17 is presumably the two-digit reference year (2017) - confirm.
ages = [17 - _two_digit_year(stamp) for stamp in df["created_at"]]
df["age"] = ages

In [20]:
#for a in range(1000):
#    randomized_Data = df.reindex(np.random.permutation(df.index))

#train_df = randomized_Data[:int((len(randomized_Data)+1)*.80)]
#test_df = randomized_Data[int((len(randomized_Data)+1)*.80):]

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Description column as a unicode string array (the cast also stringifies
# any non-string cells).
description_text = df['description'].values.astype('U')

# Token counts per description
count_vect = CountVectorizer()
df_counts = count_vect.fit_transform(description_text)

# Term-frequency scaling only (inverse-document-frequency weighting is off)
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(df_counts)
df_tf = tf_transformer.transform(df_counts)

# Dense copy of the term-frequency matrix
numerical_desc = df_tf.toarray()

In [21]:
# clfNames are only the numerical and binary features with importance > 0
clfNames = ['age', 'screen_name_len', 'bot_is_substr', 'bot_in_des',
            'followers_count', 'friends_count', 'listedcount',
            'favourites_count', 'statuses_count']

# Modelling frame: the selected features plus the 'bot' label
df_train = df[clfNames + ['bot']]

# Disabled experiment: add every term-frequency column as its own feature
#for i in range(7748):
#    feat_name = 'tf' + str(i)
#    df_train[feat_name] = numerical_desc[:,i]

# Shuffle the rows once. Repeating the permutation 1000 times and keeping only
# the last one is no more random than a single shuffle. A dedicated, seeded
# generator makes the split identical on every Restart & Run All without
# touching numpy's global random state.
split_rng = np.random.RandomState(42)
randomized_Data = df_train.reindex(split_rng.permutation(df_train.index))

# First ~80% of the shuffled rows train the models, the remainder is held out.
# The cut position is computed once so both slices always agree.
split_at = int((len(randomized_Data)+1)*.80)
train_df = randomized_Data.iloc[:split_at]
test_df = randomized_Data.iloc[split_at:]

train_df.head()

Unnamed: 0,age,screen_name_len,bot_is_substr,bot_in_des,followers_count,friends_count,listedcount,favourites_count,statuses_count,bot
581,2,13,0,0,168,224,29,432,7028,1
630,3,17,0,0,567,0,2,0,16733,1
90,7,11,1,1,1784,43,89,5,389,1
1590,7,12,0,0,136,300,5,95,1748,0
44,0,12,0,0,0,27,0,38,60,1


In [22]:
# Separate the feature columns from the boolean 'bot' target for each split.
# The column is dropped by keyword: the positional axis form
# (`drop('bot', 1)`) is rejected by pandas 2.x.
x_train = train_df.drop(columns='bot')
y_train = train_df['bot'].astype(bool)

x_test = test_df.drop(columns='bot')
y_test = test_df['bot'].astype(bool)

# Decision tree using the entropy split criterion, fitted on the training split
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(x_train, y_train)

# Mean 5-fold cross-validated accuracy and precision on the training split.
# cross_val_score fits clones of the estimator, so `dt` itself stays as fitted above.
accuracy = cross_val_score(dt, x_train, y_train, cv=5, scoring="accuracy").mean()
precision = cross_val_score(dt, x_train, y_train, cv=5, scoring="precision").mean()

print(accuracy, precision)


0.872272784577 0.863929754453


In [23]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes fitted on the training split (fit returns the estimator)
gnb = GaussianNB().fit(x_train, y_train)

# Mean 5-fold cross-validated score for each metric, accuracy first
nb_scores = {
    metric: cross_val_score(gnb, x_train, y_train, cv=5, scoring=metric).mean()
    for metric in ("accuracy", "precision")
}
accuracy = nb_scores["accuracy"]
precision = nb_scores["precision"]

print(accuracy, precision)


0.621818136582 0.554465175212


In [24]:
from sklearn import svm

# Support-vector classifier with scikit-learn's default settings,
# fitted on the training split
svclf = svm.SVC()
svclf.fit(x_train, y_train)

# Mean 5-fold cross-validated accuracy and precision on the training split
accuracy = cross_val_score(svclf, x_train, y_train, cv=5, scoring="accuracy").mean()
precision = cross_val_score(svclf, x_train, y_train, cv=5, scoring="precision").mean()

# Keep the held-out predictions; previously the return value of predict()
# was computed and silently discarded.
svm_test_pred = svclf.predict(x_test)

print(accuracy, precision)

0.542295608847 0.813888888889
