In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that files stored
# in Drive can be accessed through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Copy the zipped tweet dataset out of the mounted Drive folder into the
# current working directory.
!cp "/content/drive/My Drive/CS490A Tweet data/final_dataset.zip" .

In [None]:
import zipfile

# Unpack every member of the dataset archive into the current working
# directory, making sure the archive handle is released afterwards.
zip_ref = zipfile.ZipFile('final_dataset.zip', 'r')
try:
    zip_ref.extractall('.')
finally:
    zip_ref.close()

In [None]:
import os

# Root folder of the extracted dataset; it holds one sub-folder per candidate label.
PATH_TO_DATA = 'merged_no_dups'
BIDEN_LABEL = 'biden'
TRUMP_LABEL = 'trump'


def listLabelFiles(label):
  """Return the paths of every entry inside PATH_TO_DATA/<label>.

  os.listdir() returns entries in arbitrary order, so the names are sorted to
  make the resulting list identical from run to run and machine to machine.
  """
  labelDir = os.path.join(PATH_TO_DATA, label)
  return [os.path.join(labelDir, name) for name in sorted(os.listdir(labelDir))]


biden_full_list = listLabelFiles(BIDEN_LABEL)
trump_full_list = listLabelFiles(TRUMP_LABEL)



In [None]:
!pip install Unidecode



In [None]:
import json
import random
import re
import pprint
import unidecode

def _normalizeTweetText(rawText):
  """Transliterate rawText with unidecode and lowercase the result."""
  return unidecode.unidecode(rawText).lower()


def _stripSearchKeywords(text, searches):
  """Delete every literal occurrence of each (lowercased) keyword from text.

  Returns a (strippedText, foundAny) pair, where foundAny is True when at
  least one keyword occurred in the text.
  """
  foundAny = False
  for search in searches:
    # Plain substring replacement: keywords are treated as literal text, so
    # characters such as '+', '.', '(' or '$' cannot be misread as regex syntax.
    stripped = text.replace(search.lower(), '')
    if stripped != text:
      foundAny = True
      text = stripped
  return text, foundAny


def filterTweets(path, label):
  """Load tweet JSON files and return cleaned, labelled tweet texts.

  Args:
    path: iterable of file paths; entries not ending in '.json' are ignored.
      (Despite the singular name this is iterated as a collection.)
    label: value stored alongside every returned tweet.

  Each JSON document must provide 'id', 'full_text' and 'search_keyword'
  (presumably a list of keyword strings -- verify against the scraper).  When
  a 'retweeted_status' object is present its 'full_text' is preferred, with a
  fallback to the top-level text if no keyword occurs in it.

  A tweet is dropped (and counted) when:
    * its id was already seen in this call,
    * none of its search keywords occurs in the text, or
    * nothing but whitespace is left after cleaning.

  Cleaning removes the search keywords, then URLs, then the '#'/'@' patterns
  noted below.

  Returns:
    list of (text, label, file) tuples, where file is the source path.

  Side effects:
    prints the three skip counters to stdout.
  """
  # searchFile = open('searchResults','a')
  skippedSearchCount = 0
  skippedDuplicateCount = 0
  skippedEmptyAfterFilter = 0
  seenIds = set()
  tweets = []
  for file in path:
    if not file.endswith('.json'):
      continue

    # Context manager so the handle is closed even if parsing fails.
    with open(file, encoding='utf-8') as handle:
      data = json.load(handle)

    tweetId = data['id']
    if tweetId in seenIds:
      skippedDuplicateCount += 1
      continue
    seenIds.add(tweetId)

    searches = data['search_keyword']

    # Prefer the text of the original tweet when this is a retweet.
    retweet = 'retweeted_status' in data
    source = data['retweeted_status'] if retweet else data
    text, foundSearch = _stripSearchKeywords(_normalizeTweetText(source['full_text']), searches)

    if not foundSearch and retweet:
      # Keyword absent from the original tweet: retry on the retweet wrapper text.
      text, foundSearch = _stripSearchKeywords(_normalizeTweetText(data['full_text']), searches)

    if not foundSearch:
      # Checked once per tweet, outside any keyword loop, so the tweet is really
      # skipped and the counter is incremented exactly once.
      skippedSearchCount += 1
      # print('(skipped) cant find search: ' + ''.join(searches) + ', file: ' + file + ', tweet: ' + text + '\n\n\n')
      continue

    # replace urls
    text = re.sub(r'http\S+', '', text)

    # replace hashtags
    # NOTE(review): the negated class removes '#' followed by NON-alphanumeric
    # characters (e.g. a dangling '#' left after keyword stripping), not whole
    # hashtags such as '#vote' -- confirm this is the intended behaviour.
    text = re.sub('#[^a-zA-Z0-9]+', '', text)

    # replace users
    # NOTE(review): same negated class as above; '@name' mentions are kept.
    text = re.sub('@[^a-zA-Z0-9]+', '', text)

    # Whitespace-only leftovers carry no tokens, so treat them as empty too.
    if not text.strip():
      skippedEmptyAfterFilter += 1
      continue

    # Record the individual source file (not the whole input list).
    tweets.append((text, label, file))
  print('skipped (search keyword not found): ', skippedSearchCount)
  print('skipped (empty string after filter): ', skippedEmptyAfterFilter)
  print('skipped (duplicate found): ', skippedDuplicateCount)

  return tweets

# Seed for the shuffle below so the fold composition is reproducible between runs.
SHUFFLE_SEED = 42

print(BIDEN_LABEL)
bidenFiltered = filterTweets(biden_full_list, BIDEN_LABEL)
print('total: ', len(bidenFiltered))
print()
print(TRUMP_LABEL)
trumpFiltered = filterTweets(trump_full_list, TRUMP_LABEL)
print('total: ', len(trumpFiltered))

# Build a NEW list: `tweets = bidenFiltered` followed by `tweets += ...` would
# extend (and later shuffle) bidenFiltered itself, because both names would
# refer to the same list object.
tweets = bidenFiltered + trumpFiltered
# A dedicated Random instance keeps the global `random` state untouched.
random.Random(SHUFFLE_SEED).shuffle(tweets)

# Cleaned tweet texts grouped by label (despite the name, these are the texts
# themselves, not counts).
targetTypeCounts = {
    BIDEN_LABEL : [],
    TRUMP_LABEL : [],
}

# Parallel lists: data[i] is the cleaned text and targets[i] its label.
data = []
targets = []
for tweet in tweets:
  data.append(tweet[0])
  targets.append(tweet[1])
  targetTypeCounts[tweet[1]].append(tweet[0])

biden
skipped (search keyword not found):  0
skipped (empty string after filter):  0
skipped (duplicate found):  0
total:  187435

trump
skipped (search keyword not found):  5
skipped (empty string after filter):  0
skipped (duplicate found):  0
total:  150296


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import TweetTokenizer
from nltk.corpus import stopwords
import nltk
import numpy as np

nltk.download('stopwords')

tknzr = TweetTokenizer()

# How many of the most frequent tokens to print for each candidate.
TOP_FEATURES_TO_SHOW = 50

for key, value in targetTypeCounts.items():
  targetCounts = CountVectorizer(tokenizer=tknzr.tokenize, stop_words=stopwords.words("english"))
  # fit_transform yields the document-term matrix; its column sums are the
  # total number of occurrences of each token in this candidate's tweets.
  termMatrix = targetCounts.fit_transform(value)
  totals = np.asarray(termMatrix.sum(axis=0)).ravel()
  # vocabulary_ maps token -> column index (NOT a frequency), so it is only
  # used here to look up each token's column in `totals`.
  listCounts = [(term, int(totals[column])) for term, column in targetCounts.vocabulary_.items()]
  listCounts.sort(key=lambda tup: tup[1], reverse=True)
  # Print only the head of the ranking instead of the entire vocabulary.
  print('Top Features per candidate', key, ' : ', listCounts[:TOP_FEATURES_TO_SHOW])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!






In [None]:
# Learn the token vocabulary from every cleaned tweet.  The per-document word
# counts themselves are produced later by vectorizer.transform(...).
vectorizer = CountVectorizer(
    stop_words=stopwords.words("english"),
    tokenizer=tknzr.tokenize,
)
vectorizer.fit(data)



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None,
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7fd03bc40748>>,
                vocabulary=None)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, recall_score, precision_score
from statistics import mean

kf = KFold(n_splits=5)

# Explicit, fixed label order for the per-class metrics.  Deriving it from
# set(predictions) gives an arbitrary order (so index 0 is not guaranteed to be
# Trump) and breaks when a fold predicts a single class.
CLASS_LABELS = [TRUMP_LABEL, BIDEN_LABEL]

accuracyScoresTest = []
recallScoresTest = {TRUMP_LABEL: [], BIDEN_LABEL: []}
precisionScoresTest = {TRUMP_LABEL: [], BIDEN_LABEL: []}

accuracyScoresTrain = []
recallScoresTrain = {TRUMP_LABEL: [], BIDEN_LABEL: []}
precisionScoresTrain = {TRUMP_LABEL: [], BIDEN_LABEL: []}


def _recordFoldScores(yTrue, yPred, accuracyScores, recallScores, precisionScores):
  """Append one fold's accuracy and per-class recall/precision to the given accumulators."""
  accuracyScores.append(accuracy_score(yTrue, yPred))
  recalls = recall_score(yTrue, yPred, average=None, labels=CLASS_LABELS)
  precisions = precision_score(yTrue, yPred, average=None, labels=CLASS_LABELS)
  # With labels=CLASS_LABELS the i-th score belongs to CLASS_LABELS[i].
  for classLabel, recallValue, precisionValue in zip(CLASS_LABELS, recalls, precisions):
    recallScores[classLabel].append(recallValue)
    precisionScores[classLabel].append(precisionValue)


count = 1
for train_index, test_index in kf.split(data):
  # Progress indicator: 1-based fold number.
  print(count)
  count += 1
  sentences_train = [data[index] for index in train_index]
  sentences_test = [data[index] for index in test_index]
  x_train = vectorizer.transform(sentences_train)
  x_test = vectorizer.transform(sentences_test)
  y_train = [targets[index] for index in train_index]
  y_test = [targets[index] for index in test_index]

  classifier = LogisticRegression(penalty='l2', max_iter=400000)
  classifier.fit(x_train, y_train)

  # Held-out performance for this fold.
  predictions = classifier.predict(x_test)
  _recordFoldScores(y_test, predictions, accuracyScoresTest, recallScoresTest, precisionScoresTest)

  # Performance on the training split, for comparison against the held-out scores.
  predictionsTrain = classifier.predict(x_train)
  _recordFoldScores(y_train, predictionsTrain, accuracyScoresTrain, recallScoresTrain, precisionScoresTrain)

print('average accuracy test: ', mean(accuracyScoresTest))
print('average recall trump test: ', mean(recallScoresTest[TRUMP_LABEL]))
print('average recall biden test: ', mean(recallScoresTest[BIDEN_LABEL]))
print('average precision trump test: ', mean(precisionScoresTest[TRUMP_LABEL]))
print('average precision biden test: ', mean(precisionScoresTest[BIDEN_LABEL]))

print('average accuracy train: ', mean(accuracyScoresTrain))
print('average recall train trump: ', mean(recallScoresTrain[TRUMP_LABEL]))
print('average recall train biden: ', mean(recallScoresTrain[BIDEN_LABEL]))
print('average precision train trump: ', mean(precisionScoresTrain[TRUMP_LABEL]))
print('average precision train biden: ', mean(precisionScoresTrain[BIDEN_LABEL]))

1
2
3
4
5
average accuracy test:  0.916353545363537
average recall trump test:  0.8950075631068591
average recall biden test:  0.9334696660685814
average precision trump test:  0.9151586282281482
average precision biden test:  0.9172691679610011
average accuracy train:  0.947983009947562
average recall train trump:  0.9328354476939649
average recall train biden:  0.9601288940328196
average precision train trump:  0.9493936398692218
average precision train biden:  0.9468870440953647
