# Training a Logistic Regression Model
Data: the Sentiment140 dataset. It contains 1,600,000 tweets extracted using the Twitter API. The tweets have been annotated (0 = negative, 4 = positive).

It contains the following 6 fields:

1. target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive); only 0 and 4 occur in the training file loaded below

2. ids: the id of the tweet (2087)

3. date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

4. flag: The query (lyx). If there is no query, then this value is NO_QUERY.

5. user: the user that tweeted (robotickilldozr)

6. text: the text of the tweet (Lyx is cool)

Only fields 1 (target) and 6 (text) are used.

[sentiment140 link](https://www.kaggle.com/datasets/kazanova/sentiment140)

## Installs

In [None]:
!pip install pyspellchecker
!pip install pyspark
!pip install findspark
!pip install nltk
!pip install plotly



## Imports

In [None]:
#### for data manipulation and math operations ####
import pandas as pd
import numpy as np

#### for visualizations ####
# plotly
from plotly.offline import iplot
import plotly.graph_objs as go
from plotly.subplots import make_subplots

#### NLP packages ####
# NLTK library
from nltk.corpus import stopwords
# SKLearn 
from sklearn.feature_extraction.text import CountVectorizer
# py-spell checker
from spellchecker import SpellChecker


#### other useful packages ####
import string
from collections import Counter
import re
from tqdm import tqdm


#### Pyspark packages ####
import findspark
# findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier 
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [None]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads later on.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/linuxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Processing the data

In [None]:
# Location of the raw Sentiment140 CSV and the names given to its six columns.
file_path = '/home/training.1600000.processed.noemoticon.csv'
colnames = ['sentiment', 'ids', 'date', 'flag', 'user', 'text']

# The file is read without a header row, so the column names are supplied explicitly.
train = pd.read_csv(
    file_path,
    names=colnames,
    header=None,
    encoding="ISO-8859-1",
    engine='python',
)

In [None]:
# Preview the first rows of the raw frame (all six columns).
train.head()

Unnamed: 0,sentiment,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Keep only the tweet text and its label. The explicit .copy() yields an
# independent frame, so the later in-place assignment to train['text'] does not
# trigger pandas' SettingWithCopyWarning.
train = train[['text', 'sentiment']].copy()

In [None]:
# Preview the frame after reducing it to the text and sentiment columns.
train.head()

Unnamed: 0,text,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [None]:
# Call describe() (the bare attribute only displays the bound-method repr) to
# show summary statistics for the frame.
train.describe()

<bound method NDFrame.describe of                                                       text  sentiment
0        @switchfoot http://twitpic.com/2y1zl - Awww, t...          0
1        is upset that he can't update his Facebook by ...          0
2        @Kenichan I dived many times for the ball. Man...          0
3          my whole body feels itchy and like its on fire           0
4        @nationwideclass no, it's not behaving at all....          0
...                                                    ...        ...
1599995  Just woke up. Having no school is the best fee...          4
1599996  TheWDB.com - Very cool to hear old Walt interv...          4
1599997  Are you ready for your MoJo Makeover? Ask me f...          4
1599998  Happy 38th Birthday to my boo of alll time!!! ...          4
1599999  happy #charitytuesday @theNSPCC @SparksCharity...          4

[1600000 rows x 2 columns]>

In [None]:
# Show column dtypes, non-null counts and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   text       1600000 non-null  object
 1   sentiment  1600000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


In [None]:
# List the distinct label values present in the data.
train['sentiment'].unique()

array([0, 4])

### Cleaning Functions

In [None]:
def remove_HTML(text):
    """
    Return `text` with every HTML-style tag deleted.

    A tag is any shortest run that starts with '<' and ends with '>';
    everything outside such runs is left as it is.
    """
    return re.compile(r'<.*?>').sub(r'', text)

def remove_URL(text):
    """
    Return `text` with URLs deleted.

    A URL is a run of non-whitespace characters that begins with
    'http://', 'https://' or 'www.'.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    without_urls = url_pattern.sub(r'', text)
    return without_urls

# Code-point ranges treated as emoji; compiled once instead of on every call.
# The former catch-all range U+24C2..U+1F251 also covered CJK ideographs, kana,
# Hangul and many other scripts, so ordinary non-Latin text was deleted. It is
# replaced by the specific symbol blocks listed below; characters from that old
# range that are not in these blocks are now preserved.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicator symbols)
    "\U00002702-\U000027B0"  # dingbats
    "\U00002600-\U000026FF"  # miscellaneous symbols
    "\U000024C2"             # circled M
    "\U0001F170-\U0001F251"  # enclosed alphanumeric / ideographic supplements
    "]+",
    flags=re.UNICODE,
)


def remove_emojis(text):
    """
    Inputs a string and outputs a string free of emojis.

    Only characters in the emoji/symbol ranges of _EMOJI_PATTERN are removed;
    letters of any script (Latin, CJK, Hangul, ...) are kept.
    """
    return _EMOJI_PATTERN.sub(r'', text)

def remove_punctuations(text):
    """
    Return `text` with punctuation deleted.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    cleaned = non_word_or_space.sub(r'', text)
    return cleaned

In [None]:
# set of all (lowercase) English stopwords
stop = set(stopwords.words('english'))
stop.discard('not')  # keep "not" so negation survives the cleaning


def remove_stop_words(text):
    """
    inputs a text string and outputs a string without any stopwords

    The text is split on whitespace and each word is compared to the stopword
    set case-insensitively, so capitalised stopwords such as "I", "You" or
    "Just" are removed as well. Words that are kept retain their original
    casing and are re-joined with single spaces.
    """
    return " ".join(word for word in text.split() if word.lower() not in stop)

In [None]:
def clean_text(text):
    """
    Clean a raw string by applying, in this order:
    1) remove_HTML          - strip html tags
    2) remove_URL           - strip urls
    3) remove_emojis        - strip emojis
    4) remove_punctuations  - strip punctuation
    5) remove_stop_words    - drop stopwords

    Spell correction is not applied (the correct_typo step is disabled).
    """
    cleaning_steps = (
        remove_HTML,
        remove_URL,
        remove_emojis,
        remove_punctuations,
        remove_stop_words,
    )
    for step in cleaning_steps:
        text = step(text)

    return text

In [None]:
# Overwrite the text column with its cleaned version (clean_text is applied to every tweet).
train['text'] = train['text'].apply(clean_text)

In [None]:
# Display the cleaned frame.
train

Unnamed: 0,text,sentiment
0,switchfoot Awww thats bummer You shoulda got D...,0
1,upset cant update Facebook texting might cry r...,0
2,Kenichan I dived many times ball Managed save ...,0
3,whole body feels itchy like fire,0
4,nationwideclass not behaving im mad I cant see,0
...,...,...
1599995,Just woke Having school best feeling ever,4
1599996,TheWDBcom Very cool hear old Walt interviews â,4
1599997,Are ready MoJo Makeover Ask details,4
1599998,Happy 38th Birthday boo alll time Tupac Amaru ...,4


In [None]:
# Show the first cleaned tweet of each sentiment class.
train.groupby('sentiment', as_index=False).first()

Unnamed: 0,sentiment,text
0,0,switchfoot Awww thats bummer You shoulda got D...
1,4,I LOVE Health4UandPets u guys r best


## Spark Training

In [None]:
# Reuse the SparkContext when the kernel already provides one, otherwise create
# it, so this cell does not depend on a pre-defined `sc` global.
sc = ps.SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

### Loading the cleaned text

In [None]:
# Path of the pre-cleaned tweets CSV.
# NOTE(review): no visible cell writes this file - confirm where it is produced.
file_path = '/home/cleaned_train.csv'

# Read it as a Spark DataFrame, using the header row for column names and
# letting Spark infer the column types.
df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferschema='true')
    .load(file_path)
)
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
# Print the first rows of the Spark DataFrame.
df.show()

+--------------------+------+
|               tweet|target|
+--------------------+------+
|dived many times ...|     0|
|  not the whole crew|     0|
|nope they did not...|     0|
|spring break in p...|     0|
|could not bear to...|     0|
|would ve been the...|     0|
|ahh ive always wa...|     0|
|was out most of t...|     0|
|baked you cake bu...|     0|
|blagh class at to...|     0|
|just going to cry...|     0|
|want to go to pro...|     0|
|ill tell ya the s...|     0|
|sorry bed time ca...|     0|
|bed class work gy...|     0|
|sad sad sad do no...|     0|
|just checked my u...|     0|
|oh man was ironin...|     0|
|is strangely sad ...|     0|
|oh so sorry did n...|     0|
+--------------------+------+
only showing top 20 rows



In [None]:
# Random 98% / 1% / 1% split into train, validation and test sets, with a fixed seed.
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed = 2000)

In [None]:
# Feature-extraction pipeline: tokenize -> hashed term frequencies -> IDF, plus label indexing.

# Split each tweet into a list of words.
tokenizer = Tokenizer(inputCol="tweet", outputCol="words")

# TF-IDF: hash the words into a 2**16-dimensional term-frequency vector,
# then rescale it with inverse document frequency.
hashtf = HashingTF(
    inputCol="words",
    outputCol='tf',
    numFeatures=2**16,
)
idf = IDF(
    inputCol='tf',
    outputCol="features",
    minDocFreq=5,  # minDocFreq: remove sparse terms
)

# Index the "target" column into a numeric "label" column.
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")

# Chain the stages in the order they are applied.
pipeline = Pipeline(
    stages=[
        tokenizer,
        hashtf,
        idf,
        label_stringIdx,
    ]
)

### Training

In [None]:
# Fit the feature pipeline on the training split only, then use the fitted
# pipeline to add the words / tf / features / label columns to that split.
pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)

In [None]:
# Transform the validation split with the pipeline fitted on the training split.
val_df = pipelineFit.transform(val_set)
# Peek at the transformed training rows.
train_df.show(5)

+--------------------+------+--------------------+--------------------+--------------------+-----+
|               tweet|target|               words|                  tf|            features|label|
+--------------------+------+--------------------+--------------------+--------------------+-----+
|aa oh well atleas...|     0|[aa, oh, well, at...|(65536,[16030,166...|(65536,[16030,166...|  0.0|
|aa woo gutted tha...|     0|[aa, woo, gutted,...|(65536,[17603,204...|(65536,[17603,204...|  0.0|
|aaa yehyeh sumpah...|     0|[aaa, yehyeh, sum...|(65536,[2635,1576...|(65536,[2635,1576...|  0.0|
|aaaa do not like ...|     0|[aaaa, do, not, l...|(65536,[11650,126...|(65536,[11650,126...|  0.0|
|aaaa need to work...|     0|[aaaa, need, to, ...|(65536,[2973,7194...|(65536,[2973,7194...|  0.0|
+--------------------+------+--------------------+--------------------+--------------------+-----+
only showing top 5 rows



In [None]:
# Train logistic regression (at most 150 iterations) on the transformed
# training data, then score the validation data.
LR = LogisticRegression(maxIter=150)
model = LR.fit(train_df)
predictions = model.transform(val_df)

In [None]:
# Evaluate the validation predictions from their rawPrediction column.
# NOTE(review): no metricName is set, so the evaluator's default metric is
# reported - presumably areaUnderROC; confirm against the PySpark docs.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.8454978856867478

### Saving the model

In [None]:
import os, tempfile
path = '/home/model2'
# Only the logistic-regression model is written here; the fitted feature
# pipeline (pipelineFit) is not part of this save.
# overwrite() keeps the cell re-runnable: a plain save() fails once the target
# path already exists.
model.write().overwrite().save(path)

In [None]:
# Reload the saved model from disk into a new object.
from pyspark.ml.classification import LogisticRegressionModel
newLR = LogisticRegressionModel.load(path)

### Test loading the model

In [None]:
# Score the same validation frame with the reloaded model and evaluate it with
# the same evaluator; the result should closely match the in-memory model's score.
predictionsNEW = newLR.transform(val_df)
evaluator.evaluate(predictionsNEW)

0.8454984040119407