In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.externals import joblib
from sklearn.neural_network import MLPClassifier

In [2]:
# Read the training CSV; column names are supplied explicitly via `names`.
df = pd.read_csv('train_tweets_full.csv', names=["userID", "tweets"])
# Discard every row whose tweet text is missing.
df = df.dropna(subset=['tweets'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327251 entries, 0 to 328931
Data columns (total 2 columns):
userID    327251 non-null int64
tweets    327251 non-null object
dtypes: int64(1), object(1)
memory usage: 7.5+ MB


In [3]:
# Restrict the frame to the two columns used below. The explicit .copy() yields an
# independent frame, so adding a column afterwards cannot raise SettingWithCopyWarning
# (df is otherwise a slice of a slice at this point).
col = ['userID', 'tweets']
df = df[col].copy()

# Encode every distinct userID as an integer code, numbered in order of first appearance.
df['category_id'] = df['userID'].factorize()[0]

# One row per (userID, category_id) pair, ordered by the integer code.
category_id_df = df[['userID', 'category_id']].drop_duplicates().sort_values('category_id')

# Lookup tables in both directions: userID -> code and code -> userID.
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'userID']].values)
df.head()

Unnamed: 0,userID,tweets,category_id
0,8746,let's try and catch up live next week!,0
1,8746,going to watch grey's on the big screen - thur...,0
2,8746,my pleasure patrick....hope you are well!,0
3,8746,hi there! been traveling a lot and lots more t...,0
4,8746,rt looking to drink clean & go green? purchase...,0


In [4]:
# TF-IDF vectoriser configuration: word-level tokens, English stop words removed.
tfidf = TfidfVectorizer(
    stop_words='english',
    analyzer='word',
)

In [5]:
%%time
X_train = df['tweets']
y_train = df['userID']
# tfidf.fit(df['tweets'])

CPU times: user 313 µs, sys: 1 µs, total: 314 µs
Wall time: 324 µs


In [6]:
%%time
xtrain_tfidf =  tfidf.fit_transform(X_train)
count_vect = CountVectorizer(analyzer='word', stop_words='english')
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(xtrain_tfidf)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

CPU times: user 11.4 s, sys: 285 ms, total: 11.7 s
Wall time: 11.7 s


In [None]:
%%time
#Liner Model Logistic Regression
clf = LinearSVC().fit(x_train_tfidf, y_train)

In [None]:
%%time
#MLP Classifier
clf = MLPClassifier(activation='relu', solver='sgd', hidden_layer_sizes=(10,15), random_state=1)
clf.fit(X_train_tfidf,y_train)

In [None]:
# Smoke-test the fitted classifier on a single hand-written tweet.
sample_tweet = "RT @handle: Director of Global Brand Marketing, Hotels and Casino's $125k + 30% bonus - Orlando Fl http://bit.ly/4kUmBB #jobs #twitjobs"
# The classifier was trained on TF-IDF-weighted features, so the raw term counts must go
# through the fitted TfidfTransformer as well; feeding plain counts puts the sample on a
# different scale from the training matrix.
sample_features = tfidf_transformer.transform(count_vect.transform([sample_tweet]))
print(clf.predict(sample_features))


In [None]:
import re

# Patterns are raw strings (so `\w` / `\s` are regex escapes, not invalid Python string
# escapes) and are compiled once rather than on every call.
_HANDLE_RE = re.compile(r'@\w+\s?')         # "@user" plus one optional trailing whitespace
_URL_RE = re.compile(r'https?://\S+\s?')    # http:// or https:// URL plus one optional whitespace
_HASHTAG_RE = re.compile(r'#\w+\s?')        # "#tag" plus one optional trailing whitespace


def clean_tweet_text(tweet):
    """Strip @handles, http(s) URLs and #hashtags from a tweet and lower-case the rest.

    Each removed token also takes at most one directly following whitespace
    character with it; all other text (punctuation, remaining spaces) is kept.

    Args:
        tweet: the raw tweet text as a ``str``.

    Returns:
        The cleaned, lower-cased text.
    """
    text = _HANDLE_RE.sub('', tweet)
    text = _URL_RE.sub('', text)
    text = _HASHTAG_RE.sub('', text)
    return text.lower()

In [None]:
# Number of tweets classified per predict() call. Batching avoids one predict() call per
# tweet while keeping each batch's intermediate score matrix bounded in size.
PREDICT_BATCH_SIZE = 1000

# Clean every unlabeled tweet (one per input line) up front.
with open('./whodunnit/test_tweets_unlabeled.txt') as un_fd:
    cleaned_tweets = [clean_tweet_text(line) for line in un_fd]

# Write "<1-based line number>\t<predicted userID>" for every input line.
with open('result_lin_svc_tfidf_full.txt', 'w') as res:
    line_num = 1
    for start in range(0, len(cleaned_tweets), PREDICT_BATCH_SIZE):
        batch = cleaned_tweets[start:start + PREDICT_BATCH_SIZE]
        # The classifier was trained on TF-IDF-weighted features, so the raw counts must be
        # passed through the fitted TfidfTransformer too, not fed to predict() directly.
        features = tfidf_transformer.transform(count_vect.transform(batch))
        for userID in clf.predict(features).tolist():
            res.write("%s\t%s\n" % (line_num, userID))
            line_num += 1