In [24]:
#Imports
import csv
import os
import glob
import json
import re
import string
import pandas as pd
import numpy as np
import scipy
from scipy import constants
import seaborn as sns

#Matplotlib
import matplotlib.pyplot as plt

#SciKit
from sklearn.datasets import load_iris
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score

#MODELS:
#Naive Bayes Classifiers
from sklearn.naive_bayes import GaussianNB, MultinomialNB

#Support Vector Classifier
from sklearn.svm import LinearSVC

#Decision Tree
from sklearn.tree import DecisionTreeClassifier

#NLTK
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer



In [26]:
#Extract features using count vectorizer
with open('./Tweet_Datasets/Classified_Tweets/Classified_Tweets.json', 'r', encoding='utf-8') as file:
   data = json.load(file)

#Extract TokenizedTweet values
tokenized_tweets = [entry["TokenizedTweet"] for entry in data]

#Create the CountVectorizer
cv = CountVectorizer(max_features=3000)
X_fin = cv.fit_transform([' '.join(inner_list) for inner_list in tokenized_tweets]).toarray()
feature_names = cv.get_feature_names_out()
bag_of_words_df = pd.DataFrame(X_fin, columns=feature_names)

#Set display options to show all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

#Print the BOW matrix and export to CSV
print("\nBOW Matrix:")
print(bag_of_words_df)

#Export the BOW matrix to CSV
bag_of_words_df.to_csv('bag_of_words_matrix.csv', index=False)
print("\nBOW Matrix exported to 'bag_of_words_matrix.csv'")



BOW Matrix:
     abbreviation  abt  actor  actors  actual  actually  adult  african  \
0               0    0      0       0       0         0      0        0   
1               0    0      0       0       0         0      0        0   
2               0    0      0       0       0         0      0        0   
3               0    0      0       0       0         0      0        0   
4               0    0      0       0       0         0      0        0   
5               0    0      0       0       0         0      0        0   
6               0    0      0       0       0         0      0        0   
7               0    0      0       0       0         0      0        0   
8               0    0      0       0       0         0      0        0   
9               0    0      0       0       0         0      0        0   
10              0    0      0       0       0         0      0        0   
11              0    0      0       0       0         0      0        0   
12          

In [6]:
#Multinomial Naive Bayes Classifier
MultinomialNB = MultinomialNB()

#Gussian Naive Bayes Classifier
GaussianNB =  GaussianNB()

#Linear Support Vector Machine Classifier
LinearSVC = LinearSVC()

#Decision Tree Classifier
DecisionTreeClassifier = DecisionTreeClassifier()

In [7]:
with open('./Tweet_Datasets/Classified_Tweets/Classified_Tweets.json', 'r') as file:
   data = json.load(file)


classifications = [entry['Classification'] for entry in data]

X_train, X_test, y_train, y_test = train_test_split( 
                                               X_fin, 
                                               classifications, 
                                               test_size=0.2, 
                                                random_state=0)
MultinomialNB.fit(X_train, y_train)
GaussianNB.fit(X_train, y_train)
LinearSVC.fit(X_train, y_train)
DecisionTreeClassifier.fit(X_train, y_train)



In [8]:
#Multinomial Naive Bayes Results
MNBpred = MultinomialNB.predict(X_test)
print(classification_report(y_test, MNBpred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        41
           1       0.94      1.00      0.97        29

    accuracy                           0.97        70
   macro avg       0.97      0.98      0.97        70
weighted avg       0.97      0.97      0.97        70



In [9]:
#Gaussian Naive Bayes Results
GNBpred = GaussianNB.predict(X_test)
print(classification_report(y_test, GNBpred))

              precision    recall  f1-score   support

           0       1.00      0.63      0.78        41
           1       0.66      1.00      0.79        29

    accuracy                           0.79        70
   macro avg       0.83      0.82      0.79        70
weighted avg       0.86      0.79      0.78        70



In [10]:
#Linear Support Vector Machine Classifier Results
SVMpred = LinearSVC.predict(X_test)
print(classification_report(y_test, SVMpred))


              precision    recall  f1-score   support

           0       0.98      0.98      0.98        41
           1       0.97      0.97      0.97        29

    accuracy                           0.97        70
   macro avg       0.97      0.97      0.97        70
weighted avg       0.97      0.97      0.97        70



In [11]:
#Decision Tree Classifier Results
DTpred = DecisionTreeClassifier.predict(X_test)
print(classification_report(y_test, DTpred))

              precision    recall  f1-score   support

           0       0.91      0.95      0.93        41
           1       0.93      0.86      0.89        29

    accuracy                           0.91        70
   macro avg       0.92      0.91      0.91        70
weighted avg       0.91      0.91      0.91        70



In [None]:
#PREDICTION CODE

In [27]:
nltk.download('stopwords')
nltk.download('wordnet')

def cleanText(tweet):
    # Replaces URLs with a special token
    tweet = re.sub(r'((www.\S+)|(https?://\S+))', '<URL>', tweet)
    # Removes the numbers from the text
    tweet = re.sub(r'[0-9]\S+', r'', tweet)
    # Removes tags from tweets except for hashtags
    tweet = re.sub(r'(@\S+)', r'', tweet)
    # Removes the punctuation from the text except for #
    tweet_without_punctuation = re.sub(r'[^#\w\s]', r'', tweet)
    # Sets stop words
    stop_words = set(stopwords.words("english"))
    # Removes the stop words from tweet
    tweet_without_stopwords = [word for word in tweet_without_punctuation.split() if word.lower() not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tweet_without_stopwords]
    # Join the list of cleaned words into a single string
    cleaned_tweet = ' '.join(lemmatized_words)
    return cleaned_tweet

def predict_tweet_classification(tweet):
    # Preprocess the tweet
    cleaned_tweet = cleanText(tweet)
    # Vectorize the preprocessed tweet
    tweet_vectorized = cv.transform([cleaned_tweet])
    # Predict the classification using the trained model
    predicted_class = MultinomialNB.predict(tweet_vectorized)  # Corrected
    return predicted_class[0]  # Return the predicted class

# Example usage
tweet = "This is a sample tweet."
predicted_class = predict_tweet_classification(tweet)
print("Predicted class:", predicted_class)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joeyholzman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joeyholzman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


TypeError: _BaseNB.predict() missing 1 required positional argument: 'X'