In [57]:
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix, hstack
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import ast


In [195]:
# Load the combined training tweets. The explicit lineterminator makes the
# parser split rows on '\n' only.
data = pd.read_csv("../train_data/combine.csv", sep=",", lineterminator='\n')


In [196]:
# List the columns present in the raw training file.
data.columns

Index(['Unnamed: 0', 'no', 'id', 'spam', 'source', 'tweet', 'created_at',
       'followers_count', 'friends_count', 'hashtags', 'symbols', 'text',
       'urls', 'user_description', 'user_id', 'user_location', 'user_name',
       'verified'],
      dtype='object')

In [197]:
# Discard columns that no later cell in the visible notebook reads. The frame
# is rebound to the result instead of being mutated with inplace=True.
data = data.drop(['Unnamed: 0',"no","tweet","symbols"], axis=1)


In [198]:
# Show any rows whose spam label is missing (the rendered output lists none).
data[data['spam'].isnull()]

Unnamed: 0,id,spam,source,created_at,followers_count,friends_count,hashtags,text,urls,user_description,user_id,user_location,user_name,verified


In [199]:
# Remove rows whose "id" cell is the literal string "id" (presumably header
# lines repeated inside the combined CSV - confirm against the file).
# .copy() detaches the filtered result from the original frame so that the
# column assignments in later cells write to a real DataFrame rather than to a
# slice, which would raise SettingWithCopyWarning.
data = data[~(data["id"] == "id")].copy()

In [200]:
# Inspect row count, non-null counts and dtypes of the remaining columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9828 entries, 0 to 9847
Data columns (total 14 columns):
id                  9828 non-null object
spam                9828 non-null object
source              9828 non-null object
created_at          9828 non-null object
followers_count     9828 non-null object
friends_count       9828 non-null object
hashtags            9828 non-null object
text                9828 non-null object
urls                9828 non-null object
user_description    8139 non-null object
user_id             9828 non-null object
user_location       6887 non-null object
user_name           9824 non-null object
verified            9828 non-null object
dtypes: object(14)
memory usage: 1.1+ MB


In [201]:
# Raw tweet text: the input later vectorised with TF-IDF.
X = data["text"].values
# Target labels passed to the classifiers' fit/score calls. The column dtype
# is object, so the labels are not numeric at this point.
y = data['spam'].values


In [202]:
# Encode the "verified" flag numerically: 1 where the cell is exactly the
# string 'True', 0 for every other value (including missing ones).
data["verified_num"] = (data["verified"] == 'True').astype("int64")


In [203]:
# Convert the follower counts from object dtype to float (data.info() further
# down reports float64 for this column).
# NOTE(review): convert_float is not defined anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel - define or import it above.
data["followers_count"] = data["followers_count"].apply(convert_float)

In [204]:
# Convert the user ids to float so they can be stacked as a numeric feature.
# NOTE(review): convert_float is not defined in the visible notebook - confirm
# where it comes from. Also presumably an identifier rather than a quantity;
# verify it is meant to be a model feature.
data["user_id"] = data["user_id"].apply(convert_float)

In [205]:
# Convert the friend counts from object dtype to float.
# NOTE(review): convert_float is not defined in the visible notebook - confirm
# where it comes from.
data["friends_count"] = data["friends_count"].apply(convert_float)

In [206]:
# Cast the 0/1 verified indicator to float, matching the other numeric
# feature columns (data.info() below reports float64).
# NOTE(review): convert_float is not defined in the visible notebook - confirm
# where it comes from.
data["verified_num"] = data["verified_num"].apply(convert_float)

In [207]:
# Verify that the four numeric feature columns are float64 with no nulls.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9828 entries, 0 to 9847
Data columns (total 15 columns):
id                  9828 non-null object
spam                9828 non-null object
source              9828 non-null object
created_at          9828 non-null object
followers_count     9828 non-null float64
friends_count       9828 non-null float64
hashtags            9828 non-null object
text                9828 non-null object
urls                9828 non-null object
user_description    8139 non-null object
user_id             9828 non-null float64
user_location       6887 non-null object
user_name           9824 non-null object
verified            9828 non-null object
verified_num        9828 non-null float64
dtypes: float64(4), object(11)
memory usage: 1.2+ MB


In [208]:
def get_hashtags(tag):
    """Flatten a tweet's hashtag entities into one space-separated string.

    Parameters
    ----------
    tag : str, list or tuple
        Either the Python-literal text of a list of hashtag dicts, each
        carrying a ``"text"`` key (e.g. ``"[{'text': 'news'}]"``), or the
        already-parsed list itself.

    Returns
    -------
    str
        The ``"text"`` values joined by single spaces. An empty string is
        returned when the input is missing (e.g. NaN), cannot be parsed, is
        not a list/tuple, or contains no hashtags.

    Notes
    -----
    Entries that are not dicts or that lack a ``"text"`` key are skipped, so
    one malformed entry no longer discards the hashtags that follow it.
    """
    if isinstance(tag, str):
        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            entries = ast.literal_eval(tag)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Best effort: an unparsable cell contributes no hashtags.
            return ""
    else:
        entries = tag

    # Anything other than a sequence of entities (NaN, numbers, dicts, ...)
    # carries no usable hashtags.
    if not isinstance(entries, (list, tuple)):
        return ""

    texts = [
        str(entry["text"])
        for entry in entries
        if isinstance(entry, dict) and "text" in entry
    ]
    return " ".join(texts)

In [209]:
# Replace each serialized hashtag list with its flattened text form.
# This overwrites the column, so re-running the cell on the already-flattened
# strings would turn every value into "" (they no longer parse as literals).
data["hashtags"] = data["hashtags"].map(get_hashtags)


In [210]:
# Flattened hashtag strings, one per tweet: the input to the hashtag TF-IDF.
H = data["hashtags"].values


In [211]:
# Check: one hashtag string per remaining row.
H.shape

(9828,)

In [212]:
# TF-IDF vectorizer for hashtags: unigrams only, binary term presence, and a
# term must occur in at least 2 documents (min_df=2) to enter the vocabulary.
# NOTE(review): `encoding` only matters when bytes/files are passed; the input
# here appears to be Python strings - confirm it is needed.
tfidf_hash = TfidfVectorizer(encoding='latin-1',binary=True, ngram_range=(1,1), min_df=2, use_idf=True, stop_words='english')


In [213]:
# Learn the hashtag vocabulary from the training data and vectorise it.
H_vec = tfidf_hash.fit_transform(H)


In [214]:
# TF-IDF vectorizer for tweet text: unigrams and bigrams, binary term
# presence, and a term must occur in at least 5 documents (min_df=5).
tfidf = TfidfVectorizer(encoding='latin-1',binary=True, ngram_range=(1,2), min_df=5, use_idf=True, stop_words='english')

In [215]:
# Learn the tweet-text vocabulary from the training data and vectorise it.
X_vec = tfidf.fit_transform(X)


### columns to use

In [216]:
# Feature blocks stacked below, in order: X_vec (tweet-text TF-IDF),
# H_vec (hashtag TF-IDF), followers_count, friends_count, user_id, verified_num

In [217]:
# (number of tweets, size of the text vocabulary)
X_vec.shape

(9828, 3258)

In [218]:
# Wrap every feature source as a COO sparse matrix so they can be stacked.
# A / B: the TF-IDF matrices for tweet text and hashtags.
A, B = coo_matrix(X_vec), coo_matrix(H_vec)
# C..F: each 1-D numeric column becomes a single-row matrix (1 x n_tweets),
# which is why it has to be transposed before horizontal stacking.
C, D, E, F = (
    coo_matrix(data[column].values)
    for column in ("followers_count", "friends_count", "user_id", "verified_num")
)


In [219]:
# A 1-D array wrapped by coo_matrix is a single row, hence the .T when stacking.
C.shape

(1, 9828)

In [220]:
# Assemble the full design matrix: both TF-IDF blocks followed by the four
# numeric columns (transposed from 1 x n to n x 1), then densify.
# NOTE(review): the count/id columns are stacked unscaled next to TF-IDF
# weights - consider scaling them before fitting; confirm this is intended.
tf_vectors = hstack([A, B] + [row.T for row in (C, D, E, F)]).toarray()


In [38]:
# Baseline classifier: multinomial Naive Bayes with default settings.
nb_clf= MultinomialNB()


In [39]:
# Fit the baseline on the full training feature matrix.
nb_clf.fit(tf_vectors,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [41]:
# Accuracy on the same data the model was fitted on (training accuracy, not a
# held-out estimate).
# NOTE(review): this cell's execution count (41) is lower than that of the
# feature-building cells above (up to 220), so the displayed score may come
# from an earlier version of tf_vectors - re-run to confirm.
nb_clf.score(tf_vectors,y)

0.45519713261648748

In [221]:
from sklearn.svm import SVC, LinearSVC

In [222]:
# Support vector classifier with default settings (the repr printed after
# fitting shows kernel='rbf', C=1.0, gamma='auto').
svc = SVC()

In [223]:
# Fit the SVC on the full training feature matrix.
svc.fit(tf_vectors,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [225]:
# Accuracy on the training data itself; this measures fit, not generalisation.
svc.score(tf_vectors,y)

0.99806674806674811

## TEST

In [226]:
# Load the held-out test tweets (rows are split on '\n' only).
data_test = pd.read_csv("../test_data/combine.csv", sep=",", lineterminator='\n')

# Remove rows whose "id" cell is the literal string "id" (presumably header
# lines repeated inside the combined CSV - confirm against the file).
# .copy() detaches the result so the column assignments below write to a real
# DataFrame instead of a slice (avoids SettingWithCopyWarning).
data_test = data_test[~(data_test["id"] == "id")].copy()

# Same feature engineering as applied to the training frame.
# NOTE(review): convert_float is not defined in the visible notebook - it must
# exist in the kernel before this cell runs.
data_test["verified_num"] = data_test["verified"].apply(lambda x: 1 if x == 'True' else 0)
data_test["followers_count"] = data_test["followers_count"].apply(convert_float)
data_test["user_id"] = data_test["user_id"].apply(convert_float)
data_test["friends_count"] = data_test["friends_count"].apply(convert_float)
data_test["verified_num"] = data_test["verified_num"].apply(convert_float)

data_test["hashtags"] = data_test["hashtags"].apply(get_hashtags)
H_test = data_test["hashtags"].values
X_test = data_test['text'].values

# Reuse the vectorizers fitted on the training data (transform only, no
# refit) so the test columns line up with the training vocabulary.
Htest_vec = tfidf_hash.transform(H_test)
Xtest_vec = tfidf.transform(X_test)

# Build the test design matrix with the same block order as in training.
A1 = coo_matrix(Xtest_vec)
B1 = coo_matrix(Htest_vec)
# 1-D columns become single-row matrices, hence the transposes when stacking.
C1 = coo_matrix(data_test["followers_count"].values)
D1 = coo_matrix(data_test["friends_count"].values)
E1 = coo_matrix(data_test["user_id"].values)
F1 = coo_matrix(data_test["verified_num"].values)
tf_vectors1 = hstack([A1,B1,C1.T,D1.T,E1.T,F1.T]).toarray()

y1 = data_test['spam'].values

# Accuracy of the fitted SVC on the held-out test set.
svc.score(tf_vectors1,y1)


0.89728562615669338