# Import Dependencies

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re 
import pickle
from wordcloud import WordCloud
from symspellpy import SymSpell
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from wordcloud import WordCloud
from collections import Counter

# Notebook display settings: never truncate long cell text (tweets) and never
# elide rows when a frame is rendered.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None

# Load Model

In [12]:
# Load the trained model and the vectorizer that was used during training.
#
# Paths use forward slashes so they resolve on Windows *and* POSIX systems
# (a backslash is an ordinary file-name character outside Windows).
#
# SECURITY: pickle.load can execute arbitrary code embedded in the file.
# Only load pickles produced by a trusted training run.
with open('models/logistic_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

# Load the vectorizer used during training
with open('models/tfidf_vectorizer_logistic.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

# Load Dataset

In [13]:
# Read the test tweets; the path is relative to the notebook's working directory.
test_df = pd.read_csv('test_tweets_anuFYb8.csv')
# Preview the first rows (last expression, so it is rendered as the cell output).
test_df.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedication #willpower to find #newmaterialsâ¦
1,31964,@user #white #supremacists want everyone to see the new â #birdsâ #movie â and hereâs why
2,31965,safe ways to heal your #acne!! #altwaystoheal #healthy #healing!!
3,31966,"is the hp and the cursed child book up for reservations already? if yes, where? if no, when? ððð #harrypotter #pottermore #favorite"
4,31967,"3rd #bihday to my amazing, hilarious #nephew eli ahmir! uncle dave loves you and missesâ¦"


In [14]:
# Column dtypes and non-null counts of the raw frame, before preprocessing.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


# Preprocess

In [15]:
def handle_emojis(tweet):
    """Replace ASCII emoticons in ``tweet`` with sentiment marker tokens.

    Positive emoticons become ``' EMO_POS '`` and negative ones become
    ``' EMO_NEG '`` (each padded with one space on both sides). The
    substitutions run in the order listed below, each on the result of the
    previous one.

    Args:
        tweet: The text to scan.

    Returns:
        The text with every recognised emoticon replaced by its marker.
    """
    positive = ' EMO_POS '
    negative = ' EMO_NEG '

    substitutions = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', positive),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', positive),
        # Love -- <3, :*
        (r'(<3|:\*)', positive),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', positive),
        # Yay -- ^^
        (r'(\^\^)', positive),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', negative),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', negative),
    )

    for emoticon_pattern, marker in substitutions:
        tweet = re.sub(emoticon_pattern, marker, tweet)

    return tweet

# Ordered (compiled pattern, replacement) pairs used by expand_contractions().
# Negations are anchored with \b on both sides so that they only match whole
# words; "'?" makes the apostrophe optional so "dont" and "don't" share a rule.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, flags), replacement)
    for pattern, replacement, flags in (
        (r"\bwon'?t\b", "will not", 0),
        (r"\bcan'?t\b", "can not", 0),
        (r"\bdon'?t\b", "do not", 0),
        (r"\bdidn'?t\b", "did not", 0),
        (r"\bwouldn'?t\b", "would not", 0),
        (r"\bshouldn'?t\b", "should not", 0),
        (r"\bneedn'?t\b", "need not", 0),
        (r"\bcouldn'?t\b", "could not", 0),
        (r"\bhasn'?t\b", "has not", 0),
        # The "'nt" alternatives keep the misspelt forms (have'nt, was'nt,
        # were'nt) that were already being expanded.
        (r"\bhave(?:n'?t|'nt)\b", "have not", 0),
        (r"\bwas(?:n'?t|'nt)\b", "was not", 0),
        (r"\bwere(?:n'?t|'nt)\b", "were not", 0),
        (r"\bisn'?t\b", "is not", 0),
        (r"\baren'?t\b", "are not", 0),
        (r"\bain'?t\b", "are not", 0),
        # Suffix contractions must directly follow a word character and end
        # at a word boundary, so an opening quote ("'merica") is left alone.
        (r"(?<=\w)'re\b", " are", 0),
        (r"(?<=\w)'d\b", " would", 0),
        (r"(?<=\w)'ll\b", " will", 0),
        (r"(?<=\w)'m\b", " am", 0),
        (r"(?<=\w)'ve\b", " have", 0),
        # Text-speak: a stand-alone "u"/"U" becomes "you".
        (r"\bu\b", "you", re.IGNORECASE),
    )
)


def expand_contractions(tweet):
    """Expand common English contractions in ``tweet``.

    Handles negations with or without the apostrophe ("don't"/"dont"),
    the suffixes 're, 'd, 'll, 'm, 've, and a stand-alone "u" -> "you".
    Apart from the "u" rule, matching is case-sensitive.

    Fixes relative to the previous implementation:
      * "isn't" and "aren't" are now expanded (the old patterns ended in a
        regex TAB escape, so they never matched).
      * "haven't", "wasn't" and "weren't" are now expanded.
      * Rules only match whole words, so "significant", "parent", "paint",
        "wonton", ... are no longer rewritten.
      * "won't" expands to "will not" instead of "would not".

    NOTE(review): the output differs from the previous rules for the inputs
    listed above. If the pickled vectorizer/model were fitted on text cleaned
    with the old rules, re-fit them with this version -- confirm against the
    training notebook.

    Args:
        tweet: The text to expand.

    Returns:
        The text with the recognised contractions expanded.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        tweet = pattern.sub(replacement, tweet)

    return tweet

# NLTK's English stop-word list, copied into a fresh list, with 'not' and
# 'off' taken out so that remove_stopwords() keeps those two words.
stop_words = list(stopwords.words('english'))
stop_words.remove('not')
stop_words.remove('off')

def remove_stopwords(tweet):
    """Drop stop words and non-alphabetic tokens from ``tweet``.

    The text is split on whitespace; a token is kept only if it consists
    solely of alphabetic characters and is not in the module-level
    ``stop_words`` list (the comparison is case-sensitive).

    Args:
        tweet: The text to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # str.split() with no separator already yields whitespace-free tokens,
    # so no further stripping is needed.
    kept_tokens = [
        token
        for token in tweet.split()
        if token.isalpha() and token not in stop_words
    ]
    return " ".join(kept_tokens)

def shorten_consecutive(tweet):
    """Collapse every run of three or more identical characters to two.

    Args:
        tweet: The text to normalise.

    Returns:
        The text with elongated runs (e.g. "cooool") shortened ("cool").
    """
    # (.) captures one character; \1\1+ requires at least two more copies.
    return re.sub(r"(.)\1\1+", r"\1\1", tweet)

# Shared English Snowball stemmer used by stem(); stop words are left unstemmed.
wordStem = SnowballStemmer(language="english", ignore_stopwords=True)

def stem(tweet):
    """Stem every token of ``tweet`` with the module-level ``wordStem``.

    The text is split on whitespace; single-character tokens are dropped and
    each remaining token is replaced by ``wordStem.stem(token)``.

    Args:
        tweet: The text to stem.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    return " ".join(
        wordStem.stem(token)
        for token in tweet.split()
        if len(token) > 1
    )



def preprocess(df):
    """Clean the ``'tweet'`` column of ``df`` for the vectorizer.

    Steps, in order:
      1. remove the literal ``@user`` placeholder and ``&amp;`` entity;
      2. replace emoticons with marker tokens (``handle_emojis``);
      3. expand contractions (``expand_contractions``);
      4. delete every character that is not an ASCII letter, digit or space
         (this also removes the underscore inside the emoticon markers);
      5. drop stop words and non-alphabetic tokens (``remove_stopwords``);
      6. shorten elongated character runs (``shorten_consecutive``);
      7. stem the remaining tokens (``stem``).

    Rows whose tweet is missing (NaN/None) are not passed through the text
    helpers as real text; they end up as the placeholder string ``'None'``.
    Previously the placeholder was applied with a chained
    ``fillna(..., inplace=True)`` *after* the pipeline, which could not
    protect the string helpers from a missing value and does not modify the
    frame under pandas Copy-on-Write.

    Args:
        df: DataFrame with a ``'tweet'`` column of strings.

    Returns:
        The same ``df`` object; its ``'tweet'`` column is overwritten in place
        with the cleaned text.
    """
    raw_tweets = df['tweet']
    # Remember the missing rows before they are temporarily replaced by ''.
    missing = raw_tweets.isna()

    cleaned = (
        raw_tweets.fillna('')
        .str.replace('@user', '', regex=False)
        .str.replace('&amp;', '', regex=False)
        .apply(handle_emojis)
        .apply(expand_contractions)
        .str.replace(r"[^A-Za-z0-9 ]+", "", regex=True)
        .apply(remove_stopwords)
        .apply(shorten_consecutive)
        .apply(stem)
    )

    # Single assignment at the end: the frame is either fully processed or,
    # if a step raises, left exactly as it was passed in.
    df['tweet'] = cleaned.mask(missing, 'None')

    return df

In [16]:
# Clean the tweet text. preprocess() overwrites the 'tweet' column of the frame
# it is given, so re-running this cell would clean already-cleaned text;
# reload the CSV first if this cell has to be re-run.
test_df = preprocess(test_df)

In [17]:
# Re-check dtypes and non-null counts after preprocessing.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17197 entries, 0 to 17196
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      17197 non-null  int64 
 1   tweet   17197 non-null  object
dtypes: int64(1), object(1)
memory usage: 268.8+ KB


# Apply Model

In [18]:
# Turn the cleaned tweets into features with the already-fitted vectorizer
# (transform only, no re-fitting), then predict one label per tweet.
X_new_vect = loaded_vectorizer.transform(test_df['tweet'])
predictions = loaded_model.predict(X_new_vect)

In [19]:
# Submission frame: each tweet id paired with its predicted label.
# test_df itself is left untouched (column selection + assign build a new frame).
solution = test_df[['id']].assign(label=predictions)

# Save as CSV file

In [20]:
# Write the id/label pairs to disk; index=False keeps the row index out of the file.
solution.to_csv('test_predictions.csv', index=False)