In [0]:
#!pip install thundersvm

In [2]:
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
labelencoder = LabelEncoder()

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [5]:
path='train_tweets.txt'
dtype={'tweet':'str'}
header=['author_Id', 'tweet']
data= pd.read_csv(path, sep='\t', names=header, dtype=dtype)
print(data.shape)
data.head(5)

(328195, 2)


Unnamed: 0,author_Id,tweet
0,8746,@handle Let's try and catch up live next week!
1,8746,Going to watch Grey's on the big screen - Thur...
2,8746,@handle My pleasure Patrick....hope you are well!
3,8746,@handle Hi there! Been traveling a lot and lot...
4,8746,RT @handle Looking to Drink Clean & Go Green? ...


In [6]:
y = data.author_Id
X = data.tweet
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=69)

train_data = pd.concat([X_train, y_train],axis=1)

class_group = train_data.groupby('author_Id')
class_size = class_group.size()
num_class = len(class_size)
print('----Original Training Data----')
print('Number of Classes:'+str(num_class))
print('Number of Tweets:'+str(len(train_data)))

reduced_data = train_data
reduced_data = class_group.filter(lambda x: len(x) >= 10)
print('\n----Reduced Training Data----')
print('Number of Classes:'+str(len(reduced_data.groupby('author_Id').size())))
print('Number of Tweets:'+str(len(reduced_data)))

#reduced_data.sort_values(by = "author_Id")
#concatenated_data = reduced_data.groupby('author_Id')['tweet'].apply(' '.join).reset_index()
#concatenated_data.sample(frac=1).reset_index(drop=True)
#print('\n----Concatenated Reduced Training Data----')
#print('Number of Classes:'+str(len(concatenated_data.groupby('author_Id').size())))
#print('Number of Tweets:'+str(len(concatenated_data)))

----Original Training Data----
Number of Classes:9283
Number of Tweets:262556

----Reduced Training Data----
Number of Classes:8854
Number of Tweets:260306


In [9]:
X_train = reduced_data.tweet
y_train = reduced_data.author_Id

In [10]:
def preprocess(data_line):
  data_line = re.sub('#', ' #hashtag ', data_line)
  data_line = re.sub('@handle', ' #handle ', data_line)
  data_line = re.sub(r'http\S+', '#http', data_line)
  #data_line = re.sub(r'\d', r' \g[0] ', data_line)    
  data_line = data_line.replace("\n","")
  signs = set('<^>-+@>,!?$())[](&%/*.:;"')
  for word in data_line.split():
    if not re.search(r'^[:=;xX][oO0\-]?[dD\)\]\(\]/\\o0OpP]$', word):
      prods = set(word) & signs
      for sign in prods:
        data_line = data_line.replace(sign, ' {} '.format(sign))
  return [word.lower() for word in data_line.split()]

In [11]:
bow_transformer=CountVectorizer(analyzer=preprocess, max_features=5000).fit(X_train)
text_bow_train=bow_transformer.transform(X_train)
text_bow_dev=bow_transformer.transform(X_dev)

input_dim = len(bow_transformer.get_feature_names())
print(input_dim)

5000


In [12]:
method = 'SDG'

if method == 'NB':
  model = MultinomialNB()
  
if method == 'SVM':
  model = SVC(gamma='auto',verbose=2, C=1, kernel='linear')
  
if method == 'SDG':
  #parameters = {'panelty':['l2', 'l1', 'elasticnet']}
  sdg = SGDClassifier(early_stopping=True, verbose=0, n_jobs=-1)
  #sdg_gs = GridSearchCV(sdg, parameters)
  
print(method +' model is built.')

SDG model is built.


In [13]:
clf = sdg.fit(text_bow_train, y_train)
print(clf)

#best_parameters = sdg_gs.best_params_
#print(best_parameters)



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=True, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=-1, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)


In [None]:
predict_y = clf.predict(text_bow_dev)
print('Prediction has been completed.')

In [None]:
def classifier_report(true_y, predict_y):
    
    print("Accuracy: ",accuracy_score(true_y, predict_y))
    print("Precision: ", precision_score(true_y, predict_y, average = "macro"))
    print("Recall: ", recall_score(true_y, predict_y, average = "macro"))
    print("F1-score: ", f1_score(true_y, predict_y, average = "macro"))
    
    return 
  
classifier_report(y_dev, predict_y)

In [None]:
filename = '/content/drive/My Drive/Colab Notebooks/SDG_5000d_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [None]:
result = open("/content/drive/My Drive/Colab Notebooks/SDG_5000d_model.txt","w+")
result.write("Accuracy: ",accuracy_score(true_y, predict_y)) 
result.write("Rrecision: ", precision_score(true_y, predict_y, average = "macro"))
result.write("Recall: ", recall_score(true_y, predict_y, average = "macro"))
result.write("F1-score: ", f1_score(true_y, predict_y, average = "macro"))