# Logistic Regression

- __Model info:__
    - features:
        - contains negative words (T/F)
        - contains offensive words (T/F)
        - sentiment (+, -, 0)
        - contains emoji
    - model: Logistic Regression
    - max acc: 80% counter with ngrams(1, 3)
    
https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import collections

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer

In [2]:
#nltk.download(['stopwords'])

In [2]:
tweets_DF = pd.read_csv('../Dataset-OLID/OLIDv1.0/data_subtask_a.csv')
tweets_DF.head(10)

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,label_a
0,0,86426,@USER She should ask a few native Americans wh...,OFF,1
1,1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,1
2,2,16820,Amazon is investigating Chinese employees who ...,NOT,0
3,3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,1
4,4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,0
5,5,97670,@USER Liberals are all Kookoo !!!,OFF,1
6,6,77444,@USER @USER Oh noes! Tough shit.,OFF,1
7,7,52415,@USER was literally just talking about this lo...,OFF,1
8,8,45157,@USER Buy more icecream!!!,NOT,0
9,9,13384,@USER Canada doesn’t need another CUCK! We alr...,OFF,1


In [3]:
tweets_test_DF = pd.read_csv('../Dataset-OLID/OLIDv1.0/test_data_subtask_a.csv')
tweets_test_DF.head(5)

Unnamed: 0.1,Unnamed: 0,id,tweet,subtask_a,label_a
0,0,15923,#WhoIsQ #WheresTheServer #DumpNike #DECLASFISA...,OFF,1
1,1,27014,"#ConstitutionDay is revered by Conservatives, ...",NOT,0
2,2,30530,#FOXNews #NRA #MAGA #POTUS #TRUMP #2ndAmendmen...,NOT,0
3,3,13876,#Watching #Boomer getting the news that she is...,NOT,0
4,4,60133,#NoPasaran: Unity demo to oppose the far-right...,OFF,1


## 1. Preprocessing

In [4]:
import nltk
from nltk import word_tokenize
from nltk import ngrams
from nltk.tokenize.casual import TweetTokenizer

'''
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WordPunctTokenizer
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
'''


'\nfrom nltk.corpus import stopwords\nfrom nltk.stem import PorterStemmer\nfrom nltk.tokenize import WordPunctTokenizer\nfrom nltk.collocations import BigramCollocationFinder\nfrom nltk.metrics import BigramAssocMeasures\n'

In [5]:
# remove unnecessery columns
tweets_DF = tweets_DF.rename(columns = {'label_a':'label'})
tweets_DF = tweets_DF.drop(["Unnamed: 0", "id", "subtask_a"], axis=1)

tweets_test_DF = tweets_test_DF.rename(columns = {'label_a':'label'})
tweets_test_DF = tweets_test_DF.drop(["Unnamed: 0", "id", "subtask_a"], axis=1)


-----------------

Removing @USER

In [6]:
import re

In [7]:
REMOVE_USER = re.compile("@USER")

def remove_user(tweet):
    return REMOVE_USER.sub("", tweet)

In [8]:
tweets_DF['tweet'] = tweets_DF['tweet'].apply(remove_user)
tweets_test_DF['tweet'] = tweets_test_DF['tweet'].apply(remove_user)

-------------------------------
Extracting emoji

In [9]:
import emoji
import regex

In [10]:
def split_count(text):
    emoji_list = []
    data = regex.findall(r'\X', text)
    for word in data:
        if any(char in emoji.UNICODE_EMOJI for char in word):
            emoji_list.append(word)

    return emoji_list

In [11]:
tweets_DF['emoji'] = tweets_DF['tweet'].apply(split_count)
tweets_test_DF['emoji'] = tweets_test_DF['tweet'].apply(split_count)

----------------------

### B) Sentiment

In [12]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [13]:
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    return [score['neg'], score['neu'], score['pos'], score['compound']]

In [14]:
tweets_DF['sentiment'] = tweets_DF['tweet'].apply(sentiment_analyzer_scores)
tweets_DF['negative_sentiment'] = tweets_DF['sentiment'].apply(lambda row: row[0])
tweets_DF['neutral_sentiment'] = tweets_DF['sentiment'].apply(lambda row: row[1])
tweets_DF['positive_sentiment'] = tweets_DF['sentiment'].apply(lambda row: row[2])
tweets_DF['compound'] = tweets_DF['sentiment'].apply(lambda row: row[3])

tweets_DF = tweets_DF.drop(["sentiment"], axis=1)

In [15]:
tweets_test_DF['sentiment'] = tweets_test_DF['tweet'].apply(sentiment_analyzer_scores)
tweets_test_DF['negative_sentiment'] = tweets_test_DF['sentiment'].apply(lambda row: row[0])
tweets_test_DF['neutral_sentiment'] = tweets_test_DF['sentiment'].apply(lambda row: row[1])
tweets_test_DF['positive_sentiment'] = tweets_test_DF['sentiment'].apply(lambda row: row[2])
tweets_test_DF['compound'] = tweets_test_DF['sentiment'].apply(lambda row: row[3])

tweets_test_DF = tweets_test_DF.drop(["sentiment"], axis=1)

-------------------------------
### C) Positive, negative and offensive words

In [18]:
from nltk import word_tokenize

In [19]:
negative_words_file = open("negative-words.txt")
negative_words = []

positive_words_file = open("positive-words.txt")
positive_words = []

for line in negative_words_file:
    if not str.startswith(line, ";"):
        negative_words.append(line.split("\n")[0])
        
negative_words = negative_words[1:]

for line in positive_words_file:
    if not str.startswith(line, ";"):
        positive_words.append(line.split("\n")[0])
        
positive_words = positive_words[1:]

In [56]:
def check_for_negative_words(tweet):
    tweet = preprocess_tweet(tweet)
    tweet = str.lower(tweet)
    words = word_tokenize(tweet)
    if any(word in negative_words for word in words):
        return True
    return False

def check_for_positive_words(tweet):
    tweet = preprocess_tweet(tweet)
    tweet = str.lower(tweet)
    words = word_tokenize(tweet)
    if any(word in positive_words for word in words):
        return True
    return False

In [57]:
tweets_DF['negative_words'] = tweets_DF['tweet'].apply(check_for_negative_words)
tweets_test_DF['negative_words'] = tweets_test_DF['tweet'].apply(check_for_negative_words)

tweets_DF['positive_words'] = tweets_DF['tweet'].apply(check_for_positive_words)
tweets_test_DF['positive_words'] = tweets_test_DF['tweet'].apply(check_for_positive_words)

In [22]:
offensive_words_file = open("facebook_bad_words.txt")
offensive_words = []

for line in offensive_words_file:
    offensive_words.append(line.strip())
        
offensive_words[0] in "baldjd 2g1c" 

True

In [23]:
def check_for_offensive_words(tweet):
    #tweet = preprocess_tweet(tweet)
    tweet = str.lower(tweet)
    if any(word in tweet for word in offensive_words):
        return True
    return False

In [24]:
tweets_DF['offensive_words'] = tweets_DF['tweet'].apply(check_for_offensive_words)
tweets_test_DF['offensive_words'] = tweets_test_DF['tweet'].apply(check_for_offensive_words)

----------------------------
### D) Removing stopwords, lemmatization

In [25]:
english_stop_words = stopwords.words('english')

In [26]:
lemmatizer = WordNetLemmatizer() #SnowballStemmer('english')
   
def preprocess_tweet(tweet, remove_stopwords=False):
    #remove all non letters
    regex = re.compile('[^a-zA-Z]')
    removed_nonalphanumeric = regex.sub(' ', tweet)
    lowercased_tweet = str.lower(removed_nonalphanumeric)

    #remove stopwords
    if remove_stopwords:
        lowercased_tweet = " ".join(word if word not in english_stop_words else "" for word in lowercased_tweet.split())
    
    #lemmatization
    lemmatized = ' '.join([lemmatizer.lemmatize(word) for word in lowercased_tweet.split()])
    return lemmatized
    

In [27]:
tweet = tweets_DF['tweet'][2]
print(tweet, "\n")
print(preprocess_tweet(tweet, False))
preprocess_tweet(tweet, True)

Amazon is investigating Chinese employees who are selling internal data to third-party sellers looking for an edge in the competitive marketplace. URL #Amazon #MAGA #KAG #CHINA #TCOT 

amazon is investigating chinese employee who are selling internal data to third party seller looking for an edge in the competitive marketplace url amazon maga kag china tcot


'amazon investigating chinese employee selling internal data third party seller looking edge competitive marketplace url amazon maga kag china tcot'

In [28]:
tweets_DF['clean_tweet_with_stopwords'] = tweets_DF['tweet'].apply(preprocess_tweet, False)
tweets_DF['clean_tweet_without_stopwords'] = tweets_DF['tweet'].apply(preprocess_tweet, True)

tweets_test_DF['clean_tweet_with_stopwords'] = tweets_test_DF['tweet'].apply(preprocess_tweet, False)
tweets_test_DF['clean_tweet_without_stopwords'] = tweets_test_DF['tweet'].apply(preprocess_tweet, True)

### E) Word embeddings

In [45]:
from gensim.models import KeyedVectors, word2vec
from gensim.test.utils import common_texts, get_tmpfile

In [51]:
corpus = tweets_DF['clean_tweet_with_stopwords'].values

# we need to pass splitted sentences to the model
tokenized_sentences = [sentence.split() for sentence in corpus]
model = word2vec.Word2Vec(tokenized_sentences, min_count=1)
#model.save("word2vec.model")

In [46]:
model = word2vec.Word2Vec.load("word2vec.model")
#model.train(corpus, total_examples=1000, epochs=10)
#model.save("word2vec.model")

In [47]:
model.vocabulary

<gensim.models.word2vec.Word2VecVocab at 0x7f2e2c0cacf8>

In [48]:
def embedd(sentance):
    tokens = sentance.split()
    summed_embedding = np.zeros(100)
    for token in tokens:
        if token in model.wv:
            summed_embedding += model.wv[token]
        
        
    return summed_embedding/len(tokens)

In [207]:
tknz = TweetTokenizer()
tweets_test_DF['clean_tweet_with_stopwords'] = tweets_test_DF['tweet'].apply(preprocess_tweet)#(tknz.tokenize).apply(" ".join)

In [208]:
tweets_DF['embedding'] = tweets_DF['clean_tweet_with_stopwords'].apply(embedd)
tweets_test_DF['embedding'] = tweets_test_DF['clean_tweet_with_stopwords'].apply(embedd)

## 2. Models
------------------------------------

### 2.1 Model: sentiment

In [58]:
#creating train, test 
sentiment = tweets_DF[["negative_sentiment", "negative_words", "offensive_words"]].values
y = tweets_DF['label'].values

X_test_sentiment = tweets_test_DF[["negative_sentiment", "negative_words", "offensive_words"]].values
y_test_sentiment = tweets_test_DF['label'].values

X_train_sentiment, X_val_sentiment, y_train_sentiment, y_val_sentiment = train_test_split(sentiment, y, test_size=0.2, random_state=1000)

In [63]:
classifier_sentiment = LogisticRegression(max_iter=1000, multi_class='multinomial')
classifier_sentiment.fit(X_train_sentiment, y_train_sentiment)
score = classifier_sentiment.score(X_test_sentiment, y_test_sentiment)

print("Accuracy:", score)

Accuracy: 0.7802325581395348


### 2.2 Model : tf_idf

In [80]:
tweets = tweets_DF['tweet'].values
y = tweets_DF['label'].values

tweets_test = tweets_test_DF['tweet'].values
y_test = tweets_test_DF['label'].values

vectorizer_tf = TfidfVectorizer()
vectorizer_tf.fit(tweets)

tweets_train, tweets_valid, y_train, y_valid = train_test_split(tweets, y, test_size=0.2, random_state=1000)

X_train_tfidf = vectorizer_tf.transform(tweets_train)
X_valid_tfidf = vectorizer_tf.transform(tweets_valid)
X_test_tfidf  = vectorizer_tf.transform(tweets_test)

In [81]:
classifier_tfidf = LogisticRegression(max_iter=100)
classifier_tfidf.fit(X_train_tfidf, y_train)
score = classifier_tfidf.score(X_test_tfidf, y_test)

print("Accuracy:", score)

Accuracy: 0.7906976744186046


### 2.3 Model: count_vector, clean data: char-based and word-based
- Old data voc size: 19083
- New data voc size: 16622

In [66]:
tweets = tweets_DF['clean_tweet_without_stopwords'].values
y = tweets_DF['label'].values

tweets_test = tweets_test_DF['clean_tweet_without_stopwords'].values
y_test = tweets_test_DF['label'].values

char_vectorizer = CountVectorizer(binary=True, analyzer='char' , ngram_range=(3, 5))
char_vectorizer.fit(tweets)

tweets_train, tweets_valid, y_train, y_valid = train_test_split(tweets, y, test_size=0.2, random_state=1000)

X_train_clean = char_vectorizer.transform(tweets_train)
X_valid_clean = char_vectorizer.transform(tweets_valid)
X_test_clean  = char_vectorizer.transform(tweets_test)

In [67]:
classifier_char = LogisticRegression(max_iter=1000)
classifier_char.fit(X_train_clean, y_train)
score = classifier_char.score(X_test_clean, y_test)

print("Accuracy:", score)

Accuracy: 0.7662790697674419


------------------------------

In [68]:
tweets = tweets_DF['clean_tweet_without_stopwords'].values
y = tweets_DF['label'].values

tweets_test = tweets_test_DF['clean_tweet_without_stopwords'].values
y_test = tweets_test_DF['label'].values

vectorizer = CountVectorizer(binary=True,  ngram_range=(1, 3))
vectorizer.fit(tweets)

tweets_train, tweets_valid, y_train, y_valid = train_test_split(tweets, y, test_size=0.2, random_state=1000)

X_train_clean = vectorizer.transform(tweets_train)
X_valid_clean = vectorizer.transform(tweets_valid)
X_test_clean  = vectorizer.transform(tweets_test)

In [69]:
classifier_clean = LogisticRegression(max_iter=1000)
classifier_clean.fit(X_train_clean, y_train)
score = classifier_clean.score(X_test_clean, y_test)

print("Accuracy:", score)

Accuracy: 0.8


----------------------------

In [157]:
X_train_clean = vectorizer.transform(tweets[:9000])
X_test_clean  = vectorizer.transform(tweets[9000:])
y_train = y[:9000]
y_test = y[9000:]

classifier_clean = LogisticRegression(max_iter=1000)
classifier_clean.fit(X_train_clean, y_train)
score = classifier_clean.score(X_test_clean, y_test)

print("Accuracy:", score)

Accuracy: 0.7599056603773585


Failed to predicit:

In [158]:
prediction = classifier_clean.predict(X_test_clean)
z = zip(prediction, y_test, range(9000, 9000+len(prediction)))
indices = np.array([i if p1!=p2 else -1 for p1, p2, i in z ])
indices = indices[indices>0]
for i in indices:
    print(tweets_DF['label'][i],tweets_DF['tweet'][i], tweets_DF['negative_words'][i],  "\n")

1  It bothers me because I feel whoever still supports him is just as racist as he is.  I'm half black and my daughters and daughter are black so I don't get how my family doesn't see anything wrong with supporting him. True 

1  We agree that Trump is a Nazi sympathizer. But the Sandy Hook Hoax and gun control is right out of Hitler's playbook. Bloomberg is a Nazi when it comes to gun control. Dems need to moderate!!!! True 

1  Innocent until proven guilty!         I think time has run out on this! And personnel I think she is lying! And someone is paying her to lie! JMO True 

1 😀😆😂  rocks #MAGA 👍 URL False 

1   How about really healing the country by leaving it and taking the rest of the loony liberals with you False 

0     Read  that I just posted. He got too close to Kavauagh and the Body Guard started into action to cover him. What was his intentions? He's a gun control nut grieving and blames Trump and Kavanaugh for his daughters death. His actions make my skin crawl too. Tru

1 #RedWave a coming!! That's ok tho. No conservative will harm you b/c of your political ideals.  Cant say the same for liberals tho. Liberals can be vile and nasty to opposing views. URL True 

1 The beauty of #TheView is that no one pretends not to be a leftist.   #maga False 

1  Can’t rehabilitate a pervert zero tolerance So disappointed in this pope True 

1   I know exactly how it works. They steal money through regulations and taxes and the people can't do a thing about it because the government has the monopoly on violence. Why do you think democrats desperately need gun control? They want even more power over the people. True 

1   We don't watch any NFL games. This guy can shove it in his pie hole. False 

1  They're all Democrats right?   They're not hiding it anymore! #FakeJournalists  #FakeNews  #NoHonor #MAGA False 

1 Lethargic voters get what they deserve! Aggressive voters MAGA URL True 

1    fails to realize he is the party of  and Antifa. A complete lack of self awa

0   The problem for May is how it will devastate her government &amp; party. The ERG will go bonkers &amp; try to stop a second vote. Result chaos.  Many Conservatives fear this will totally destroy the party. But repressing it WILL. True 

1  She is a parasitic lump False 

1  Carrey is becoming weird like peewee Herman... True 

1  And yes. Having less access to guns drops suicide rates and accidental injuried snd desths.  This can be proven by looking at every single country that has strict or better gun control.  Unkess you want to prove otherwise that these countries are just as bad. True 

1                                               He also clarified that there is the water birth when we are physically born. Then there is the spiritual birth which is obviously when we spiritually die to our sinful flesh and be reborn sinless after being cleansed by His blood at salvation when our sin debt is paid! Praise God True 

1      You are all rapid dog's 🤪 False 

1  Fudge report...th

### 2.4 Word embeddings

In [209]:
tweets = tweets_DF['embedding'].values
tweets = np.array([np.array(t) for t in tweets]).reshape(-1, 100)
y = tweets_DF['label'].values

tweets_test = tweets_test_DF['embedding'].values
X_test_embedding = np.array([np.array(t) for t in tweets_test]).reshape(-1, 100)
y_test = tweets_test_DF['label'].values

X_train_embedding, X_val_embedding, y_train, y_val = train_test_split(tweets, y, test_size=0.25, random_state=1000)

In [210]:
classifier_embedding = LogisticRegression(max_iter=1000)
classifier_embedding.fit(X_train_embedding, y_train)
score = classifier_embedding.score(X_test_embedding, y_test)

print("Accuracy:", score)

Accuracy: 0.7209302325581395


In [74]:
def ansamble(sentence):
    
    sentence = remove_user(sentence)
    neg, neu, pos, comp =  sentiment_analyzer_scores(sentence)
    negative_words = check_for_negative_words(sentence)
    offensive_words = check_for_offensive_words(sentence)
    
    preproc_with_stopwords = preprocess_tweet(sentence)
    preproc_no_stopwords = preprocess_tweet(sentence, True)
    
    #emb1 = embedd(preproc_with_stopwords)
    #emb2 = embedd(preproc_no_stopwords)
    
    score1 = classifier_sentiment.predict([[neg, negative_words, offensive_words]])
    
    tfidf = vectorizer_tf.transform([sentence])
    score2 = classifier_tfidf.predict(tfidf)
    
    count = vectorizer.transform([preproc_no_stopwords])
    score3 = classifier_clean.predict(count)
    
    count = char_vectorizer.transform([preproc_no_stopwords])
    score4 = classifier_char.predict(count)
    
    #score5 = classifier_embedding.predict([emb1])
    
    if not any(w in model.wv for w in preproc_no_stopwords.split()):# and score4[0]:
            return score4[0]
    
    vote = collections.Counter([score1[0], score2[0], score3[0], score4[0], score3[0]])
    if vote[0] > 3:
        return 0
    else:
        return 1

In [75]:
test_tweets = tweets_test_DF['tweet'].values
y = tweets_test_DF['label'].values

y_pred = [ansamble(t) for t in test_tweets]

accuracy_score(y, y_pred)

0.827906976744186

## 3.Test sentance

In [53]:
def test(sentence):
    label = ["NOT", "OFF"]
    
    sentence = remove_user(sentence)
    neg, neu, pos, comp =  sentiment_analyzer_scores(sentence)
    negative_words = check_for_negative_words(sentence)
    offensive_words = check_for_offensive_words(sentence)
    
    preproc_with_stopwords = preprocess_tweet(sentence)
    preproc_no_stopwords = preprocess_tweet(sentence, True)
    
    emb1 = embedd(preproc_with_stopwords)
    emb2 = embedd(preproc_no_stopwords)
    
    score1 = classifier_sentiment.predict([[neg, negative_words, offensive_words]])
    print("Features:    Classified as:", label[score1[0]])
    
    tfidf = vectorizer_tf.transform([sentence])
    score2 = classifier_tfidf.predict(tfidf)
    print("TF_idf:      Classified as:", label[score2[0]])
    
    count = vectorizer.transform([preproc_no_stopwords])
    score3 = classifier_clean.predict(count)
    print("Ngrams:      Classified as:", label[score3[0]])
    
    count = char_vectorizer.transform([preproc_no_stopwords])
    score4 = classifier_char.predict(count)
    print("Char-ngrams: Classified as:", label[score4[0]])
    
    score5 = classifier_embedding.predict([emb1])
    print("Embeddings:  Classified as:", label[score5[0]])
    
    score6 = ansamble(sentence)
    print("Ansamble:    Classified as:", label[score6])
    

In [507]:
test("Are you fuckisiiifn in?")
a = "Are you fuckisiiifn in?"
any([w in model.wv for w in a.split()])

Features:    Classified as: NOT
TF_idf:      Classified as: NOT
Ngrams:      Classified as: NOT
Char-ngrams: Classified as: OFF
Embeddings:  Classified as: NOT
Ansamble:    Classified as: OFF


True

---------------------------------
## 4. Testing correlation
__Tested correlation of some features and labels__

In [224]:
y_pred = tweets_DF['label'].values
y_negative_words = tweets_DF['negative_words'].values
y_offensive_words = tweets_DF['offensive_words'].values
y_negative_sentiment = tweets_DF['negative_sentiment'].values
y_negative_sentiment = [1 if neg > 0  else 0 for neg in y_negative_sentiment]

print("Tweet contains negative word: ", matthews_corrcoef(y_pred, y_negative_words))
print("Tweet has some negative sentiment: ", matthews_corrcoef(y_pred, y_negative_sentiment))
print("Tweet contains offensive words: ", matthews_corrcoef(y_pred, y_offensive_words))

Tweet contains negative word:  0.3317560038951808
Tweet has some negative sentiment:  0.302094438701342
Tweet contains offensive words:  0.2058708804504121


## TODO
https://docs.google.com/document/d/1OdniS8GEYwaFJy_zNC5GpXkceuGxmOtPK_dV9QAecho/edit

----------------------------

---------------------