In [43]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import random
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import numpy as np
from sklearn.linear_model import LogisticRegression

In [56]:
class LogisticRegression_:


    def __init__(self, alpha: float = 0.17):
        self.alpha = alpha
        pass

    def fit(self, X : np.array, y : np.array):

        self.n_feactures = X.shape[1]
        self.theta = np.random.randn(self.n_feactures)
        self.length = X.shape[0]
        sum = np.zeros(self.n_feactures)
        for i in range(self.length):
            sum = self.update_sum(sum, X, y, i)
        
        sum = -self.alpha* sum/self.length
        
        self.theta = np.add(self.theta, sum)


    def update_sum(self, sum : np.array, X : np.array, y : np.array, line: int):
        for i in range(self.n_feactures):
            sum[i] += (self.sigmoid(np.sum(np.multiply(X[line, :],self.theta))) - y[line]) * X[line, i]

        return sum

    def sigmoid(self, x):
        return 1/(1 + np.exp(-1*x))


    def predict(self, X: np.array):

        y = np.zeros(X.shape[0])
        for line in range(X.shape[0]):
            y[line] = self.sigmoid(np.multiply(X[line, :],  self.theta))

        return y

In [5]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [11]:
all_disaster_tweets = df[df["target"] == 1]["text"].to_list()
all_non_disaster_tweets = df[df["target"] == 0]["text"].to_list()
tweets = all_disaster_tweets + all_non_disaster_tweets
labels = np.append(np.ones((len(all_disaster_tweets))), np.zeros((len(all_non_disaster_tweets))))

In [9]:
stopwords_english = stopwords.words('english') 
def process_tweet(tweet):
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet = tokenizer.tokenize(tweet)
    tweets_clean = []

    for word in tweet: # Go through every word in your tokens list
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation):  # remove punctuation
            tweets_clean.append(word)

    stemmer = PorterStemmer() 
    tweets_stem = [] 

    for word in tweets_clean:
        stem_word = stemmer.stem(word)  # stemming word
        tweets_stem.append(stem_word)  # append to the list
    
    return tweets_stem    

def build_freqs(tweets, ys):
    freqs = {}
    yslist = np.squeeze(ys).tolist()

    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [12]:
freqs = build_freqs(tweets, labels)

# Logistic Regression

In [13]:
train_dis = all_disaster_tweets[:2500]
train_non = all_non_disaster_tweets[:3000]
train_tweets = train_dis + train_non
labels = np.append(np.ones((len(train_dis))), np.zeros((len(train_non))))
bias = np.ones(len(labels))
logistic_features = pd.DataFrame({'bias': bias})

In [14]:
def process_logistic_regression(tweet, freqs):
    non_dissaster_sum = 0
    dissaster_sum = 0
    for word in process_tweet(tweet):
        pair = (word, 0)
        non_dissaster_sum += freqs.get(pair, 0)
        pair = (word, 1)
        dissaster_sum += freqs.get(pair, 0)

    return dissaster_sum, non_dissaster_sum

In [15]:
dissaster_list = []
non_dissaster_list = []
for tweet in train_tweets:
    process_logistic = process_logistic_regression(tweet, freqs)
    dissaster_list.append(process_logistic[0])
    non_dissaster_list.append(process_logistic[1])

In [16]:
logistic_features['dissaster'] = dissaster_list
logistic_features['non_dissaster'] = non_dissaster_list
logistic_features['label'] = labels

In [17]:
logistic_features.head(10)

Unnamed: 0,bias,dissaster,non_dissaster,label
0,1.0,149,141,1.0
1,1.0,393,117,1.0
2,1.0,248,123,1.0
3,1.0,425,143,1.0
4,1.0,192,211,1.0
5,1.0,621,164,1.0
6,1.0,582,221,1.0
7,1.0,996,840,1.0
8,1.0,356,227,1.0
9,1.0,778,716,1.0


In [19]:
X = logistic_features[['bias', 'dissaster', 'non_dissaster']].values # Get only the numerical values of the dataframe
y = logistic_features['label'].values; # Put in Y the corresponding labels or sentiments

print(X.shape) # Print the shape of the X part
print(X) # Print some rows of X

(5500, 3)
[[  1. 149. 141.]
 [  1. 393. 117.]
 [  1. 248. 123.]
 ...
 [  1.  73. 300.]
 [  1. 169. 358.]
 [  1.  15.  89.]]


In [44]:
clf = LogisticRegression().fit(X, y)

In [52]:
theta = clf.coef_[0]
theta, np.sqrt(theta[0]**2 + theta[1]**2  + theta[2]**2)

(array([-0.10902314,  0.0078544 , -0.00800717]), 0.10959858938029954)

In [59]:
test = LogisticRegression_(alpha=0.1)
test.fit(X, y)

In [60]:
test.theta

array([ 1.35769027, -8.46564723, -8.58409381])