In [94]:
# Sentiment analysis courtesy of https://www.digitalocean.com/community/tutorials/how-to-perform-sentiment-analysis-in-python-3-using-the-natural-language-toolkit-nltk
# Trendline for dates thanks to https://stackoverflow.com/questions/70852986/add-trend-line-to-datetime-matplotlib-line-graph
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import twitter_samples, stopwords
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier
import snscrape
import pandas as pd
import re, string, random
import os
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import matplotlib.dates as dates

# Patterns are written as raw strings so that regex escapes such as \( and \)
# are not treated as (invalid) Python string escapes, and compiled once rather
# than being looked up again for every token.
_URL_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                     r'(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_MENTION_RE = re.compile(r"(@[A-Za-z0-9_]+)")


def remove_noise(tweet_tokens, stop_words = ()):
    """Clean and normalise the tokens of one tweet.

    For every token: URLs and @mentions are stripped, the token is lemmatized
    using a WordNet part of speech derived from its ``pos_tag`` tag ('n' for
    tags starting with "NN", 'v' for tags starting with "VB", 'a' otherwise),
    and the result is lower-cased.

    Tokens that end up empty, that are found inside ``string.punctuation``
    (a substring test, so multi-character tokens such as '...' are kept), or
    whose lower-cased form is in ``stop_words`` are dropped.

    Args:
        tweet_tokens: Sequence of token strings for a single tweet.
        stop_words: Iterable of lower-case words to exclude. Defaults to none.

    Returns:
        A list of cleaned, lower-cased tokens in their original order.
    """
    # Loop-invariant work: one lemmatizer per call and a hashed stop-word
    # lookup instead of scanning a list for every token.
    lemmatizer = WordNetLemmatizer()
    stop_words = frozenset(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = _URL_RE.sub('', token)
        token = _MENTION_RE.sub('', token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)
        lowered = token.lower()

        if len(token) > 0 and token not in string.punctuation and lowered not in stop_words:
            cleaned_tokens.append(lowered)
    return cleaned_tokens

def get_all_words(cleaned_tokens_list):
    """Lazily yield every token from every token list, in order.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        Each token of each inner iterable, flattened into a single stream.
    """
    for token_list in cleaned_tokens_list:
        yield from token_list

def get_tweets_for_model(cleaned_tokens_list):
    """Lazily turn each token list into a ``{token: True}`` feature dict.

    Args:
        cleaned_tokens_list: Iterable of token iterables (one per tweet).

    Yields:
        One dict per tweet mapping every token in that tweet to ``True``.
    """
    for tokens in cleaned_tokens_list:
        yield {token: True for token in tokens}

if __name__ == "__main__":
    # Train a Naive Bayes sentiment classifier on the NLTK twitter_samples
    # corpus, report its accuracy, and try it on one hand-written tweet.
    # Everything assigned here is a notebook global; the tweet-scoring cell
    # reuses `classifier`.

    RANDOM_SEED = 42   # fixes the shuffle so the train/test split is repeatable
    TRAIN_SIZE = 7000  # examples used for training; the remainder is the test set

    # Raw corpus strings and one sample token list. Nothing in this cell reads
    # them; they are kept in case other cells do.
    positive_tweets = twitter_samples.strings('positive_tweets.json')
    negative_tweets = twitter_samples.strings('negative_tweets.json')
    text = twitter_samples.strings('tweets.20150430-223406.json')
    tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

    stop_words = stopwords.words('english')

    positive_tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
    negative_tweet_tokens = twitter_samples.tokenized('negative_tweets.json')

    # Clean every tweet (strip URLs/mentions, lemmatize, drop stop words).
    positive_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in positive_tweet_tokens
    ]
    negative_cleaned_tokens_list = [
        remove_noise(tokens, stop_words) for tokens in negative_tweet_tokens
    ]

    # Most frequent tokens among the positive tweets.
    all_pos_words = get_all_words(positive_cleaned_tokens_list)

    freq_dist_pos = FreqDist(all_pos_words)
    print(freq_dist_pos.most_common(10))

    # Build labelled ({token: True}, label) examples for the classifier.
    positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
    negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

    positive_dataset = [(tweet_dict, "Positive")
                         for tweet_dict in positive_tokens_for_model]

    negative_dataset = [(tweet_dict, "Negative")
                         for tweet_dict in negative_tokens_for_model]

    dataset = positive_dataset + negative_dataset

    # Seed before shuffling so accuracy and informative features are
    # reproducible from one run to the next.
    random.seed(RANDOM_SEED)
    random.shuffle(dataset)

    train_data = dataset[:TRAIN_SIZE]
    test_data = dataset[TRAIN_SIZE:]

    classifier = NaiveBayesClassifier.train(train_data)

    print("Accuracy is:", classify.accuracy(classifier, test_data))

    # show_most_informative_features() prints the table itself and returns
    # None, so wrapping it in print() only adds a stray "None" line.
    classifier.show_most_informative_features(10)

    custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

    custom_tokens = remove_noise(word_tokenize(custom_tweet))

    print(custom_tweet, classifier.classify({token: True for token in custom_tokens}))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]
Accuracy is: 0.9963333333333333
Most Informative Features
                      :( = True           Negati : Positi =   2071.1 : 1.0
                      :) = True           Positi : Negati =    993.6 : 1.0
                     bam = True           Positi : Negati =     21.7 : 1.0
                 welcome = True           Positi : Negati =     20.6 : 1.0
                     sad = True           Negati : Positi =     20.1 : 1.0
                    poor = True           Negati : Positi =     17.0 : 1.0
                     ugh = True           Negati : Positi =     15.0 : 1.0
               community = True           Positi : Negati =     14.3 : 1.0
                followed = True           Negati : Positi =     14.2 : 1.0
                  arrive = True           Positi : Negati =     14.2 : 1.0
None
I ordered just once from TerribleCo, 

In [97]:
# Classify every tweet in each non-empty .json file under DATA_DIR, save the
# per-tweet results, and compute each file's share of tweets classified as
# 'Positive'. Relies on `classifier` and `remove_noise` from the training cell.
DATA_DIR = '/Users/jj/PyProjects/CSI4240/monday.com_hack_data/'

pos_cnt = 0
tweet_rows = []   # one dict per classified tweet; turned into a DataFrame once
graph_data = []   # (date label, positive share) per file
for root, dirs, files in os.walk(DATA_DIR, topdown=False):
    for file in files:
        path = os.path.join(root, file)
        if not (file.endswith('.json') and os.stat(path).st_size > 0):
            continue
        tweets = pd.read_json(path, lines=True)
        if len(tweets) == 0:
            # Nothing to classify; also avoids dividing by zero below.
            continue
        # str.replace returns a new string, so the stripped name must be kept.
        # File names are expected to reduce to a YYYY-MM-DD date here, which is
        # what the to_datetime call below parses.
        label = file.replace('text-query-tweets', '').replace('.json', '')
        pos_cnt = 0
        for content in tweets['content']:
            # Tokenise/clean and classify once per tweet, then reuse the results.
            custom_tokens = remove_noise(word_tokenize(content))
            sentiment = classifier.classify({token: True for token in custom_tokens})
            tweet_rows.append({
                'file': label,
                'tweet': content,
                'sent': sentiment,
                'tokenized': ' '.join(str(item) for item in custom_tokens),
            })
            if sentiment == 'Positive':
                pos_cnt = pos_cnt + 1
        graph_data.append((label, pos_cnt / len(tweets)))

tweet_cnt = len(tweet_rows)
# Building the frame in one go avoids enlarging it one cell at a time.
tweet_classed = pd.DataFrame(tweet_rows, columns=['file', 'tweet', 'sent', 'tokenized'])
tweet_classed.to_csv(os.path.join(DATA_DIR, 'classed_tweets_data1.csv'), index=False)

# Per-file summary: `d` is the date parsed from the file name, `value` the
# fraction of that file's tweets classified as 'Positive'.
df = pd.DataFrame(graph_data, columns=['d', 'value'])
df['d'] = pd.to_datetime(df['d'], format='%Y-%m-%d')
df.to_csv(os.path.join(DATA_DIR, 'df.csv'))

In [96]:
# Render the per-file summary table `df` (`d` = date, `value` = fraction of
# tweets classified 'Positive').
# NOTE(review): this cell's execution count (96) is lower than that of the cell
# that builds `df` (97), so the saved output may predate the current `df` --
# re-run in order to confirm.
display(df)

Unnamed: 0,d,value
0,2021-04-05,0.625
1,2021-04-13,0.464286
2,2021-04-29,0.441176
3,2021-04-09,0.428571
4,2021-04-25,0.545455
5,2021-04-24,0.368421
6,2021-04-08,0.652174
7,2021-04-28,0.607143
8,2021-04-12,0.823529
9,2021-04-04,0.8
