In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
labelencoder = LabelEncoder()

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [3]:
path='data/train_tweets.txt'
dtype={'tweet':'str'}
header=['author_Id', 'tweet']
data= pd.read_csv(path, sep='\t', names=header, dtype=dtype)
print(data.shape)
data.head(5)

(328195, 2)


Unnamed: 0,author_Id,tweet
0,8746,@handle Let's try and catch up live next week!
1,8746,Going to watch Grey's on the big screen - Thur...
2,8746,@handle My pleasure Patrick....hope you are well!
3,8746,@handle Hi there! Been traveling a lot and lot...
4,8746,RT @handle Looking to Drink Clean & Go Green? ...


In [20]:
y = data.author_Id
X = data.tweet
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=69)

train_data = pd.concat([X_train, y_train],axis=1)

class_group = train_data.groupby('author_Id')
class_size = class_group.size()
num_class = len(class_size)
print('----Original Training Data----')
print('Number of Classes:'+str(num_class))
print('Number of Tweets:'+str(len(train_data)))

reduced_data = train_data
#reduced_data = class_group.filter(lambda x: len(x) >= 10)
print('\n----Reduced Training Data----')
print('Number of Classes:'+str(len(reduced_data.groupby('author_Id').size())))
print('Number of Tweets:'+str(len(reduced_data)))

reduced_data.sort_values(by = "author_Id")
concatenated_data = reduced_data.groupby('author_Id')['tweet'].apply(' '.join).reset_index()
concatenated_data.sample(frac=1).reset_index(drop=True)
print('\n----Concatenated Reduced Training Data----')
print('Number of Classes:'+str(len(concatenated_data.groupby('author_Id').size())))
print('Number of Tweets:'+str(len(concatenated_data)))

----Original Training Data----
Number of Classes:9283
Number of Tweets:262556

----Reduced Training Data----
Number of Classes:9283
Number of Tweets:262556

----Concatenated Reduced Training Data----
Number of Classes:9283
Number of Tweets:9283


In [21]:
X_train = concatenated_data.tweet
y_train = concatenated_data.author_Id

In [27]:
def preprocess(data_line):
    data_line = re.sub('#', ' #hashtag ', data_line)
    data_line = re.sub(r'http\S+', '#http', data_line)
    data_line = re.sub(r'\d', ' #number ', data_line)
    data_line = re.sub(r'[:=;xX][oO0\-]?[dD\)\]\(\]/\\o0OpP]', ' #emoji ', data_line) 
    data_line = data_line.replace("\n","")
    for word in data_line.split():
        if word.isupper():
            data_line = data_line + ' #upper '
    signs = set('<^>-+@>,!?$)[](&%/*.:;"')
    prods = set(data_line) & signs
    if not prods:
      return data_line
    for sign in prods:
        data_line = data_line.replace(sign, ' {} '.format(sign))
    return [word.lower() for word in data_line.split()]

In [37]:
bow_transformer=CountVectorizer(analyzer=preprocess, max_features=3000).fit(X_train)
text_bow_train=bow_transformer.transform(X_train)
text_bow_dev=bow_transfo rmer.transform(X_dev)

input_dim = len(bow_transformer.get_feature_names())
print(bow_transformer.get_feature_names())
print(len(bow_transformer.get_feature_names()))

3000


In [38]:
method = 'SVM'

if method == 'NB':
  model = MultinomialNB()
if method == 'SVM':
  model = SVC(gamma='auto',verbose=3, C=1.0, kernel='linear')
  
print(method +' model is built.')
model = model.fit(text_bow_train, y_train)

SVM model is built.
[LibSVM]

In [25]:
model.score(text_bow_train, y_train)

0.07745340945814931

In [31]:
print(model)
print('8000 features')
print('All classes')
print('Val_acc: '+str(model.score(text_bow_dev, y_dev)))

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
8000 features
All classes
Val_acc: 0.07309678697116044


In [None]:
predict_y = model.predict(text_bow_dev)

In [34]:
def classifier_report(true_y, predict_y):
    
    print("The accuracy: ",accuracy_score(true_y, predict_y))
    print("The precision: ", precision_score(true_y, predict_y, average = "macro"))
    print("The recall: ", recall_score(true_y, predict_y, average = "macro"))
    print("The f1-score: ", f1_score(true_y, predict_y, average = "macro"))
    
    return 
classifier_report(y_dev,predict_y)

The accuracy:  0.07309678697116044
The precision:  0.0630055678313099
The recall:  0.0351348727758194
The f1-score:  0.03504775186062868


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [39]:
import pickle
filename = 'SVM_3000d_model.sav'
pickle.dump(model, open(filename, 'wb'))