In [1]:
import sklearn
import os
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
import tensorflow as tf

from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

import pandas as pd
import re as rgx
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shakabrah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Ask TensorFlow for the GPU device name and stop the notebook if the
# first GPU is not the one reported.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print(f'Found Gpu at: {device_name}')
else:
    raise SystemError('GPU device not found')

Found Gpu at: /device:GPU:0


2022-01-06 16:18:07.864259: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-06 16:18:07.895246: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-06 16:18:07.931335: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-06 16:18:07.931730: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:939] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA 

In [3]:
# Column labels are supplied by hand because the CSV is read with header=None.
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'comment_text']
dataset_path = '../../datasets/training.1600000.processed.noemoticon.csv'

df = pd.read_csv(
    dataset_path,
    names=cols,
    header=None,
    encoding='latin-1',
)
df.head()

Unnamed: 0,sentiment,id,date,query_string,user,comment_text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
def hashtags(text):
    """Collect the hashtags that occur in ``text``.

    text : str
    return : str -> the ``str()`` representation of the list of hashtag
        names (without the leading ``#``), e.g. ``"['foo', 'bar']"``.
        Note that this is a single string, not a list object.
    """
    found_tags = rgx.findall(r'#(\w+)', text)
    return str(found_tags)

In [5]:
def emoji(text):
    """Replace every ``UNICODE_EMOJI`` key found in ``text`` with a word token.

    For each key of ``UNICODE_EMOJI`` the mapped description has its commas
    and colons removed and its whitespace-separated parts joined with
    underscores; every occurrence of the key in ``text`` is replaced by that
    token.

    text : str or None
    return : str or None -> the substituted text; ``None`` is passed through.
    """
    # The None check does not depend on the loop variable, so do it once up
    # front instead of on every table entry.
    if text is None:
        return text
    for symbol in UNICODE_EMOJI:
        token = "_".join(UNICODE_EMOJI[symbol].replace(",", "").replace(":", "").split())
        text = text.replace(symbol, token)
    return text

In [6]:
def remove_users(tweet):
    '''Takes a string and removes @user mentions from it.

    A mention is an ``@`` followed by one or more letters, digits,
    underscores or hyphens. Handles that start with a digit or underscore
    (e.g. ``@_TheSpecialOne_``) and one-character handles are matched too.

    Only the ``@handle`` token is removed; a leading "RT" marker is left in
    place. The ``@domain`` part of an e-mail address also matches, so strip
    e-mail addresses first if they must be removed as a whole.
    '''
    return rgx.sub(r'@[A-Za-z0-9_-]+', '', tweet)

In [7]:
def remove_links(tweet):
    '''Takes a string and removes web links from it.

    Removes, in order: tokens starting with ``http``, ``bit.ly/...`` short
    links, and the literal placeholder ``[link]``. Surrounding whitespace is
    left untouched.
    '''
    tweet = rgx.sub(r'http\S+', '', tweet)  # remove http/https links
    # The dot is escaped so only a literal "bit.ly/" prefix matches.
    tweet = rgx.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links
    # str.strip('[link]') would strip the individual characters "[", "l",
    # "i", "n", "k", "]" from both ends of the tweet; replace removes the
    # placeholder itself wherever it occurs.
    tweet = tweet.replace('[link]', '')
    return tweet

In [8]:
def clean_html(text):
    """Remove every ``<...>`` span (shortest match) from ``text``.

    No flags are passed to the regex, so ``.`` does not match a newline and
    a tag that spans several lines is left in place.
    """
    return rgx.sub('<.*?>', r'', text)

In [9]:
def non_ascii(s):
    """Return ``s`` with every character whose code point is >= 128 removed."""
    ascii_only = filter(lambda ch: ord(ch) < 128, s)
    return "".join(ascii_only)

def lower(text):
    """Return the lower-cased form of ``text``."""
    lowered = text.lower()
    return lowered

In [10]:
# Extra tokens filtered in addition to NLTK's English stop-word list.
_CUSTOM_STOP_WORDS = (
    'and', 'I', 'A', 'http', 'And', 'So', 'arnt', 'This', 'When', 'It',
    'many', 'Many', 'so', 'cant', 'Yes', 'yes', 'No', 'no', 'These', 'these',
    'mailto', 'regards', 'ayanna', 'like', 'email',
)

# Filled lazily by _stop_word_set() and reused by every later call.
_STOP_WORD_CACHE = None


def _stop_word_set():
    """Return the combined NLTK + custom stop-word set, building it only once."""
    global _STOP_WORD_CACHE
    if _STOP_WORD_CACHE is None:
        _STOP_WORD_CACHE = frozenset(stopwords.words("english")).union(_CUSTOM_STOP_WORDS)
    return _STOP_WORD_CACHE


def removeStopWords(str):
    """Drop stop words from a whitespace-separated string.

    str : the text to filter (the parameter name shadows the builtin; it is
        kept unchanged so existing keyword callers keep working).
    return : the remaining words joined by single spaces.

    The membership test is case-sensitive. The stop-word set is built on the
    first call and cached, because this function is applied once per row and
    the set never changes between rows.
    """
    blocked = _stop_word_set()
    return ' '.join(word for word in str.split() if word not in blocked)

In [11]:
def email_address(text):
    """Delete anything shaped like ``name@host`` from ``text``.

    Both sides of the ``@`` may contain word characters, dots and hyphens.
    """
    return rgx.sub(r'[\w\.-]+@[\w\.-]+', r'', text)

def punct(text):
    """Keep only the ``\\w+`` tokens of ``text``, joined by single spaces.

    Everything between tokens (punctuation, repeated whitespace) is dropped.
    """
    words = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(words)

In [12]:
def remove_digits(text):
    """Remove digits and any other character not on the keep-list.

    Kept: ASCII letters, whitespace and the punctuation ``. , ! ? / : ; " '``.
    Everything else, digits included, is deleted.
    """
    # "A-Z", not "A-z": the latter also covers the characters between "Z"
    # and "a" in ASCII ("[", "\", "]", "^", "_", "`"), which would survive.
    pattern = r'[^a-zA-Z.,!?/:;\"\'\s]'
    return rgx.sub(pattern, '', text)

def remove_special_characters(text):
    """Remove every character that is not on the keep-list.

    Kept: ASCII letters, digits, whitespace and the punctuation
    ``. , ! ? / : ; " '``.
    """
    # "A-Z", not "A-z": the latter also covers "[", "\", "]", "^", "_" and
    # "`", which sit between "Z" and "a" in ASCII and would not be removed.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return rgx.sub(pat, '', text)

def remove_(tweet):
    """Delete every underscore from ``tweet``."""
    return rgx.sub('([_]+)', '', tweet)

In [13]:

# Cleaning steps run before / after the hashtags are captured. The order
# matters: hashtags are read from the text after mentions, HTML and non-ASCII
# characters are gone but before lower-casing and punctuation removal.
# (The emoji step is intentionally not part of the chain.)
steps_before_hashtags = (remove_users, clean_html, non_ascii)
steps_after_hashtags = (lower, email_address, removeStopWords, clean_html, punct, remove_)

df['clean_tweet'] = df.comment_text
for cleaning_step in steps_before_hashtags:
    df['clean_tweet'] = df['clean_tweet'].apply(func=cleaning_step)

df['hashtag'] = df['clean_tweet'].apply(func=hashtags)

for cleaning_step in steps_after_hashtags:
    df['clean_tweet'] = df['clean_tweet'].apply(func=cleaning_step)


In [14]:
# Recode the sentiment labels as pandas category codes.
df.sentiment = df.sentiment.astype('category').cat.codes

In [15]:
# Show how many rows fall into each recoded sentiment class.
df.sentiment.value_counts()

0    800000
1    800000
Name: sentiment, dtype: int64

In [16]:
# List the frame's columns, including the derived clean_tweet and hashtag.
df.columns

Index(['sentiment', 'id', 'date', 'query_string', 'user', 'comment_text',
       'clean_tweet', 'hashtag'],
      dtype='object')

In [17]:
# 70/30 split, stratified on the label, with a fixed seed.
df_train, df_test = train_test_split(
    df,
    test_size=0.3,
    stratify=df['sentiment'],
    random_state=21,
)
print(df_train.shape, df_test.shape)

(1120000, 8) (480000, 8)


In [18]:
# The vectorizer is fitted on the training split only.
tfidf_vectorizer = TfidfVectorizer(
    stop_words=ENGLISH_STOP_WORDS,
    max_features=1000,
    lowercase=True,
)
tfidf_vectorizer.fit(df_train.clean_tweet)

TfidfVectorizer(max_features=1000,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}))

In [19]:
# Project both splits with the vectorizer fitted on the training data.
train_idf, test_idf = (
    tfidf_vectorizer.transform(split.clean_tweet)
    for split in (df_train, df_test)
)

In [20]:
# Display the repr of the transformed training matrix.
train_idf

<1120000x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 4351089 stored elements in Compressed Sparse Row format>

In [21]:
# random_state pins the forest's randomness so a re-run yields the same
# model and metrics; 21 matches the seed used for the train/test split.
model_rf = RandomForestClassifier(n_estimators=20, n_jobs=-1, random_state=21)
model_rf.fit(train_idf, df_train.sentiment)

# Predictions on both splits, compared below to gauge overfitting.
predict_train = model_rf.predict(train_idf)
predict_test = model_rf.predict(test_idf)

In [22]:
import pickle

# Use a context manager so the file handle is flushed and closed even if
# pickling fails.
# NOTE(review): only the classifier is written here; the fitted
# tfidf_vectorizer is not part of this file, so it must be persisted
# separately for the saved model to be usable on raw text.
with open('../../saved_models/twittersentiment.model', 'wb') as model_file:
    pickle.dump(model_rf, model_file)

In [28]:
# Precision, recall and F1 of the forest on the training split.
for metric_label, metric_fn in (
    ('train precision :', sklearn.metrics.precision_score),
    ('train recall: ', sklearn.metrics.recall_score),
    ('train f1:', sklearn.metrics.f1_score),
):
    print(metric_label, metric_fn(y_true=df_train.sentiment, y_pred=predict_train))

train precision : 0.8979591144574738
train recall:  0.945575
train f1: 0.9211521316523079


In [29]:
# Precision, recall and F1 of the forest on the held-out test split.
for metric_label, metric_fn in (
    ('test precision :', sklearn.metrics.precision_score),
    ('test recall: ', sklearn.metrics.recall_score),
    ('test f1:', sklearn.metrics.f1_score),
):
    print(metric_label, metric_fn(y_true=df_test.sentiment, y_pred=predict_test))

test precision : 0.7165816602928606
test recall:  0.7440416666666667
test f1: 0.730053536714248


In [31]:
# Vectorizer and classifier bundled into one estimator so raw cleaned text
# can be passed straight to fit/predict.
# random_state pins the forest's randomness so a re-run reproduces the same
# fitted pipeline; 21 matches the seed used for the train/test split.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)),
    ('model', RandomForestClassifier(n_estimators=5, n_jobs=-1, random_state=21)),
])

pipeline.fit(df_train.clean_tweet, df_train.sentiment)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_features=1000,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterwards', 'again',
                                                       'against', 'all',
                                                       'almost', 'alone',
                                                       'along', 'already',
                                                       'also', 'although',
                                                       'always', 'am', 'among',
                                                       'amongst', 'amoungst',
                                                       'amount', 'an', 'and',
                                                       'another', 'any',
                                                       'anyhow', 'anyone',
           