In [26]:
# important imports
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

# nltk imports for nlp
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import contractions
import spacy

# flask and joblib
from flask import Flask, request, jsonify
import joblib

In [3]:
df = pd.read_csv("tweets.csv")
df

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [4]:
df.drop(['id'],axis=1,inplace=True)

In [5]:
df

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,0,Finally a transparant silicon case ^^ Thanks t...
2,0,We love this! Would you go? #talk #makememorie...
3,0,I'm wired I know I'm George I was made that wa...
4,1,What amazing service! Apple won't even talk to...
...,...,...
7915,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,0,We would like to wish you an amazing day! Make...
7917,0,Helping my lovely 90 year old neighbor with he...
7918,0,Finally got my #smart #pocket #wifi stay conne...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7920 entries, 0 to 7919
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   7920 non-null   int64 
 1   tweet   7920 non-null   object
dtypes: int64(1), object(1)
memory usage: 123.9+ KB


In [7]:
# no nans so let's continue
df["label"].value_counts()

0    5894
1    2026
Name: label, dtype: int64

In [8]:
# we can easily notice that data not imbalanced
# i am gonna use smote to balance the data
# but first i will clean data and prepair it

In [9]:
data = df.iloc[:5000,:]
test_prod = df.iloc[5000:,:]
test_prod.to_csv('test.csv')

In [10]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\kai
[nltk_data]     ozwald\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
stop_words = set(stopwords.words('english'))
stop_words.remove("not") # as it is needed to be labeled as negative
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's'

In [12]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [13]:
# remove punctuations from the data
def remove_punctuation(tweet):
    tweet_nonpunct = "".join([new_tweet for new_tweet in tweet if new_tweet not in string.punctuation])
    return tweet_nonpunct

In [14]:
data["tweet"] = data["tweet"].apply(remove_punctuation,)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet"] = data["tweet"].apply(remove_punctuation,)


In [15]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to C:\Users\kai
[nltk_data]     ozwald\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\kai
[nltk_data]     ozwald\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\kai
[nltk_data]     ozwald\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [16]:
from spellchecker import SpellChecker
spell = SpellChecker()
def correct_spelling(text):
    corrected_text = []
    unknown_words = spell.unknown(text.split())
    for word in text.split():
        if word in unknown_words:
            corrected_text.append(word)  
        else:
            corrected_word = spell.correction(word)
            corrected_text.append(corrected_word)
    return " ".join(corrected_text)

In [17]:
# clean tweets func
import re
def cleaner(text):
#     text = re.sub(r"http\S+", "", text)
#     text = re.sub(r"(#\w+)", "", text)
#     text = re.sub(r"@[A-Za-z0-9]+", "", text)
#     text = re.sub(r" +", " ", text)
#     text = contractions.fix(text)
#     text = re.sub(r"[^a-zA-Z]", " ", text)
#     text = text.lower()
    tokens = word_tokenize(text)
#     tokens = [token for token in tokens if token not in stop_words]
#     corrected_tokens = [spell.correction(token) for token in tokens]
    filtered_tokens = [token for token in tokens if len(token)>2]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text

In [18]:
data["tweet"] = data["tweet"].apply(cleaner)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["tweet"] = data["tweet"].apply(cleaner)


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer # turn text to numerical for smote to work proberly
from imblearn.over_sampling import SMOTE

X = data['tweet']
y = data['label']

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(X)

sm = SMOTE(random_state=42, k_neighbors=5)
X_res, y_res = sm.fit_resample(X_tfidf,y)

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)
regressor=LogisticRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [21]:
accuracy=accuracy_score(y_test,y_pred)
accuracy

0.9235924932975871

In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[672,  67],
       [ 47, 706]], dtype=int64)

In [27]:
# Save the model and the TF-IDF vectorizer
joblib.dump(regressor, 'logistic_regression_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']