# Capstone Project

## Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from spellchecker import SpellChecker
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import json
import ast

!pip install pyspellchecker==0.5.6

from spellchecker import SpellChecker

In [None]:
# Fetch the NLTK resources used by the text-cleaning helpers defined below.
for _nltk_resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)

# Shared, module-level helpers reused by every cleaning function.
spell = SpellChecker()
lmtzr = WordNetLemmatizer()
# A set so that the per-word membership test in the stop-word filters is cheap.
stop_words = set(stopwords.words('english'))

## Functions to handle recurring strings

In [None]:
def remove_stopwords_from_stringified_list(words):
    """Parse a list literal held in a string and drop its stop words.

    Args:
        words: Text of a Python list literal, e.g. "['the', 'cat']".

    Returns:
        A list with the parsed items whose lowercase form is not in
        ``stop_words``; the original casing of kept items is preserved.
    """
    kept = []
    for token in ast.literal_eval(words):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

def remove_stopwords(word_list):
    """Split a space-separated string and drop its stop words.

    Args:
        word_list: Despite the name, a single string; it is split on the
            literal space character.

    Returns:
        A list of the pieces whose lowercase form is not in ``stop_words``.
    """
    tokens = word_list.split(" ")
    return list(filter(lambda token: token.lower() not in stop_words, tokens))

def isInteger(s):
    """Return True when ``int(s)`` succeeds, False when it raises ValueError."""
    try:
        int(s)
        return True
    except ValueError:
        return False

def remove_numerals(words):
    """Remove the space-separated pieces of ``words`` that parse as integers.

    Args:
        words: A string; it is split on the literal space character.

    Returns:
        The remaining pieces joined back together with single spaces.
    """
    non_numeric = (token for token in words.split(" ") if not isInteger(token))
    return " ".join(non_numeric)

def remove_users(word):
    """Drop every whitespace-separated token that contains an "@".

    Args:
        word: The text to filter (a whole tweet in this notebook).

    Returns:
        The kept tokens, each followed by one space, so a non-empty result
        always ends with a trailing space.
    """
    kept = [token for token in word.split() if "@" not in token]
    return "".join(token + " " for token in kept)

def spell_check(word_list):
    """Replace each word with the spell checker's suggested correction.

    Args:
        word_list: Iterable of words to check.

    Returns:
        A list of the same length in which each word is replaced by
        ``spell.correction(word)``, or kept unchanged when that call
        returns None.
    """
    filtered_words = []
    for word in word_list:
        # Look the correction up once per word instead of twice.
        suggestion = spell.correction(word)
        filtered_words.append(word if suggestion is None else suggestion)
    # Kept from the existing behaviour: echoes every corrected row.
    print(filtered_words)
    return filtered_words

def pos_tagger(nltk_tag):
    """Translate a tag from ``nltk.pos_tag`` into a ``wordnet`` POS constant.

    Args:
        nltk_tag: Tag string; only its leading letter is inspected.

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        for tags starting with J, V, N or R respectively, otherwise None.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if nltk_tag.startswith(prefix):
            # Resolved only on a match, like the attribute access it replaces.
            return getattr(wordnet, attribute)
    return None

def lmtzr_data(word_list):
    """Lemmatize a sequence of words and return them as one string.

    The words are joined into a sentence, tokenized and POS-tagged with
    NLTK, and each token is lemmatized with the POS that ``pos_tagger``
    derives for it. Tokens without a mapped POS are kept unchanged.

    Args:
        word_list: An iterable of word strings, or a single string. A string
            that is the text of a list/tuple literal (e.g. "['a', 'b']") is
            parsed back into its items; any other string is split on
            whitespace.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    if isinstance(word_list, str):
        # Iterating a str yields single characters, which would be tagged and
        # lemmatized one letter at a time. The notebook passes a column loaded
        # with pd.read_excel here - presumably each cell is the text form of a
        # list; TODO confirm against the saved spreadsheet.
        try:
            parsed = ast.literal_eval(word_list)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            word_list = parsed
        else:
            word_list = word_list.split()

    sentence = " ".join(word_list)
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))

    lemmas = []
    for token, nltk_tag in tagged_tokens:
        wordnet_tag = pos_tagger(nltk_tag)
        if wordnet_tag is None:
            # No wordnet POS for this tag: keep the token as it is.
            lemmas.append(token)
        else:
            lemmas.append(lmtzr.lemmatize(token, wordnet_tag))
    return " ".join(lemmas)

def listToString(list):
    """Join the given strings into one string separated by single spaces."""
    # The parameter name shadows the builtin; it is kept so that existing
    # keyword callers continue to work.
    separator = " "
    return separator.join(list)

## Preparing the data

In [None]:
# Load labeled_data.csv into a DataFrame; the cleaning cell below reads its
# "tweet" column.
df = pd.read_csv("labeled_data.csv")


### Cleaning the data

In [None]:
# Tokens are runs of word characters, so punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')

# 1. Drop every token containing "@".
df["tweet"] = df["tweet"].apply(remove_users)

# 2. Tokenize, then join back into one space-separated string per tweet.
df["tokenized_tweet"] = df["tweet"].map(tokenizer.tokenize).apply(listToString)

# 3. Split again and filter out stop words; each cell becomes a list.
df["stopword_removed_tweet"] = df["tokenized_tweet"].apply(remove_stopwords)

# 4. Spell-correct each remaining word.
df["spell_fixed_tweet"] = df["stopword_removed_tweet"].apply(spell_check)

### Loading the saved clean data

In [None]:
# Read the spell-checked data from spell_checked_data.xlsx.
# NOTE(review): nothing visible in this notebook writes `df` (with its
# "spell_fixed_tweet" column) to this file before it is read here; the only
# write happens later, after lemmatization. Presumably the file was produced
# by an earlier run - confirm.
new_df = pd.read_excel("spell_checked_data.xlsx")

# Show the loaded frame.
print(new_df)

### Using Lemmatization to get the root of the words

In [None]:
print(new_df.columns)

# Lemmatize each spell-checked tweet into a single space-separated string.
new_df["lemmatized_tweet"] = new_df["spell_fixed_tweet"].apply(lmtzr_data)

# Series.apply returns a new Series, so the result has to be assigned back;
# without the assignment the numerals are never actually removed.
new_df["lemmatized_tweet"] = new_df["lemmatized_tweet"].apply(remove_numerals)

# Persist the frame, now including the "lemmatized_tweet" column, back to
# the same spreadsheet it was loaded from.
new_df.to_excel("spell_checked_data.xlsx", index=False)

### Splitting the training and the test data

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    new_df['lemmatized_tweet'],
    new_df['class'],
    test_size=0.2,
    random_state=42,
)

### Using TfidfVectorizer to transform the data into vectors

In [None]:
# TF-IDF features over 1- to 3-grams, keeping at most 5000 features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))

### Pipelines

In [None]:
from sklearn.base import clone

# Each pipeline gets its own unfitted copy of the configured vectorizer.
# Sharing one instance means that fitting any pipeline refits the TF-IDF step
# used by the other two, which silently breaks as soon as they are fitted on
# different data. clone() copies the parameters only, so the settings still
# live in a single place (`vectorizer`).
pipeline_lr = Pipeline([
    ('tfidf', clone(vectorizer)),
    ('classifier', LogisticRegression(max_iter=1000))
])

pipeline_svc = Pipeline([
    ('tfidf', clone(vectorizer)),
    ('classifier', SVC())
])

pipeline_rf = Pipeline([
    ('tfidf', clone(vectorizer)),
    ('classifier', RandomForestClassifier())
])

### Cross-Validation

In [None]:
def custom_cross_validation(pipeline, X, y):
    """Run 5-fold cross-validation and print the accuracy summary.

    Args:
        pipeline: Estimator handed to ``cross_val_score``.
        X: Training inputs.
        y: Training labels.

    Returns:
        None. The per-fold scores, their mean and their standard deviation
        are printed.
    """
    fold_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=5)
    summary = (
        ("Cross-validation scores:", fold_scores),
        ("Mean Accuracy:", fold_scores.mean()),
        ("Standard Deviation of Accuracy:", fold_scores.std()),
    )
    for caption, value in summary:
        print(caption, value)

# Performing cross-validation for each pipeline, on the training split only.
for model_label, model_pipeline in (
    ("Logistic Regression:", pipeline_lr),
    ("SVM:", pipeline_svc),
    ("Random Forest:", pipeline_rf),
):
    print(model_label)
    custom_cross_validation(model_pipeline, X_train, y_train)

### Using Logistic Regression to train the model

In [None]:
# Fit on the training split, then predict the held-out split; fit() returns
# the fitted pipeline, so the two calls can be chained.
y_pred_lr = pipeline_lr.fit(X_train, y_train).predict(X_test)

### Using Support Vector Classification to train the model

In [None]:
# Fit on the training split, then predict the held-out split; fit() returns
# the fitted pipeline, so the two calls can be chained.
y_pred_svc = pipeline_svc.fit(X_train, y_train).predict(X_test)

### Using Random Forest to train the model

In [None]:
# Fit on the training split, then predict the held-out split; fit() returns
# the fitted pipeline, so the two calls can be chained.
y_pred_rf = pipeline_rf.fit(X_train, y_train).predict(X_test)

### Evaluating the prediction results

In [None]:
# Accuracy of each model's predictions on the held-out test split.
for caption, predictions in (
    ("Logistic Regression Accuracy:", y_pred_lr),
    ("SVM Accuracy:", y_pred_svc),
    ("Random Forest Accuracy:", y_pred_rf),
):
    print(caption, accuracy_score(y_test, predictions))