# Analysis of Tweets

# Import necessary modules

In [3]:
import csv, re, math #used throughout
from collections import Counter #used in jaccard
from __future__ import division #used in TF_IDF
import string #_used in TF_IDF
import os   #used to check for existence of files
import statistics #used to calculate means
#from itertools import groupby #used in early attempt   at grouping keywords
import operator  #use in to count key words   

## Load tweets from .csv file

## Arrange Tweets by Date and Strip URLs

In [6]:
inputfilename = 'realFinalNHB.csv'

tweetList = []
with open(inputfilename, 'r', encoding='latin-1') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        tweetList.append(row)


def remURL(tweetTxt):
    url="https*\S*" #many of the tweets contain no colon or slashes after http[s]
    return re.sub(url, '', tweetTxt)

allTweetsByDate={}
allTweetsByDate[tweetList[0]['publicationTime'][:8]] = []
for dic in tweetList:
    dateT=dic['publicationTime'][:8] 
    if dateT in allTweetsByDate.keys():
        allTweetsByDate[dateT].append(dic['bodyText'])
    else:
        allTweetsByDate[dateT]=[dic['bodyText']]
for key in allTweetsByDate.keys():
    for  n in range(len(allTweetsByDate[key])):
        allTweetsByDate[key][n]= remURL(allTweetsByDate[key][n])


## Pairwise comparison function. Used for both cosine and Jaccard

In [7]:
def pairwiseCompare (compFunction, stringList):
    resultMatrix = []
    for s in stringList:
        resultList = []
        for S in stringList:
            resultList.append(compFunction(s,S))
        resultMatrix.append(resultList)
    return resultMatrix

##  Function to calculate cosine similarity

In [8]:


WORD = re.compile(r'\w+')

def get_cosine(string1, string2):
     vec1 = text_to_vector(string1)
     vec2 = text_to_vector(string2)
     intersection = set(vec1.keys()) & set(vec2.keys())
     numerator = sum([vec1[x] * vec2[x] for x in intersection])

     sum1 = sum([vec1[x]**2 for x in vec1.keys()])
     sum2 = sum([vec2[x]**2 for x in vec2.keys()])
     denominator = math.sqrt(sum1) * math.sqrt(sum2)

     if not denominator:
        return 0.0
     else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)


## Test pairwise comparison, cosine similarity function

In [9]:
pairwiseCompare(get_cosine, ["hello", "hello world", "lololol "])

[[1.0, 0.7071067811865475, 0.0],
 [0.7071067811865475, 0.9999999999999998, 0.0],
 [0.0, 0.0, 1.0]]

## Calculate Jaccard Similarity

In [10]:
tokenize = lambda doc: doc.lower().split(" ")
def jaccard_similarity(string1, string2):
    query = tokenize(string1)
    document = tokenize(string2)
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)
# pairwiseCompare(jaccard_similarity, ["hello", "hello world", "lololol "]) #test: outputs [[1.0, 0.5, 0.0], [0.5, 1.0, 0.0], [0.0, 0.0, 1.0]]

## TF_IDF Frequency 

In [11]:
#TF_IDF
def term_frequency(term, tokenized_document):
    return tokenized_document.count(term)

def sublinear_term_frequency(term, tokenized_document):
    try:
        return 1 + math.log(tokenized_document.count(term))
    except:
        return 1

def augmented_term_frequency(term, tokenized_document):
    max_count = max([term_frequency(t, tokenized_document) for t in tokenized_document])
    return (0.5 + ((0.5 * term_frequency(term, tokenized_document))/max_count))

def inverse_document_frequencies(tokenized_documents):
    idf_values = {}
    all_tokens_set = set([item for sublist in tokenized_documents for item in sublist])
    for tkn in all_tokens_set:
        contains_token = map(lambda doc: tkn in doc, tokenized_documents)
        idf_values[tkn] = 1 + math.log(len(tokenized_documents)/(sum(contains_token)))
    return idf_values
def tfidf(documents):
    tokenized_documents = [tokenize(d) for d in documents]
    idf = inverse_document_frequencies(tokenized_documents)
    tfidf_documents = []
    for document in tokenized_documents:
        doc_tfidf = []
        for term in idf.keys():
            tf = sublinear_term_frequency(term, document)
            doc_tfidf.append(tf * idf[term])
        tfidf_documents.append(doc_tfidf)
    return tfidf_documents
#tfidf(["hello", "hello world", "lololol "]) #test. outputs [[2.09861228866811, 1.4054651081081644, 2.09861228866811, 2.09861228866811],
 #[2.09861228866811, 1.4054651081081644, 2.09861228866811, 2.09861228866811],
 #[2.09861228866811, 1.4054651081081644, 2.09861228866811, 2.09861228866811]]

### Function to calculate average for cosine, Jaccard, and TF_IDF

In [12]:
def matrixmean(matrix):
    mylist=[]
    for row in matrix:
        mylist.append(statistics.mean(row))
    return statistics.mean(mylist)

## Write pairwise cos similarity to .csv

In [None]:
def matrixmean(matrix):
    mylist=[]
    for row in matrix:
        mylist.append(statistics.mean(row))
    return statistics.mean(mylist)
        
def writePairwiseCos(dateKey):
    filekey=re.sub('/','', dateKey)
    filename='cosine/pairwiseCos' + filekey + '.csv'
    if not os.path.isfile(filename):
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect = 'excel')
            todaysTweets = allTweetsByDate [dateKey]
            todayCos = pairwiseCompare(get_cosine, todaysTweets)
            for row in todayCos:
                writer.writerow(row)
        csvfile.close()
        return  matrixmean (todayCos)
    else:
        return -1 #indicates error. Presence of -1 in cosAverages list means writePairwiseCos has been called too many times
!mkdir cosine

cosAverages = []
for key in allTweetsByDate.keys():
    cosAverages.append(writePairwiseCos(key))

## Write pairwise Jaccard similarity 

In [None]:
def writePairwiseJaccard(dateKey):
    filekey=re.sub('/','', dateKey)
    filename='jaccard/pairwiseJaccard' + filekey + '.csv'
    if not os.path.isfile(filename):
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect = 'excel')
            todaysTweets = allTweetsByDate [dateKey]
            todayJaccard = pairwiseCompare(jaccard_similarity, todaysTweets)
            for row in todayJaccard:
                writer.writerow(row)
        csvfile.close()
        return  matrixmean (todayJaccard)
    else:
        return 0

!mkdir jaccard
jaccardAverages=[]
for key in allTweetsByDate.keys():
    jaccardAverages.append(writePairwiseJaccard(key))


## Write pairwise TF_IDF

In [None]:
!mkdir tfidf

In [None]:
def write_tfidf(dateKey):
    filekey=re.sub('/','', dateKey)
    filename='tfidf/pairwiseTFIDF' + filekey + '.csv'
    if not os.path.isfile(filename):
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect = 'excel')
            todaysTweets = allTweetsByDate [dateKey]
            todayTFIDF = tfidf(todaysTweets)
            for row in todayTFIDF:
                writer.writerow(row)
            csvfile.close()
        return matrixmean(todayTFIDF)
    else:
        return 0

tfidfAverages = []
for key in allTweetsByDate.keys():
    tfidfAverages.append(write_tfidf(key))

## Write daily averages to .csv file

In [None]:
!touch dailyaverages.csv

In [None]:
averagesList =[]
datecount=0
for dateKey in allTweetsByDate.keys(): 
    dailyAverages = [dateKey, cosAverages[datecount], jaccardAverages[datecount], tfidfAverages[datecount]]
    averagesList.append(dailyAverages)
    datecount += 1

with open("dailyaverages.csv", 'w') as csvfile:
    writer = csv.writer(csvfile)  
    for row in averagesList:
        writer.writerow(row)
 


## Keyword Analysis

In [None]:
def tokens(string):
    return re.findall(r"[\w']+", string.upper())

stoplistfilename = "stopList.txt"

with open(stoplistfilename, 'r') as file:
    rawStop = file.read()
stopwords = tokens(rawStop)

print(stopwords) #test: returns ['THE', 'A', 'IN'] for test stoplist

In [None]:
def stripStopWords(tokenList):
    result=[]
    for word in tokenList:
        if (not (word in stopwords)):
            result.append(word)
    return result
#print(stripStopWords(tokens("The unicycle is in a shop that is the best"))) #test: returns ['UNICYCLE', 'IS', 'SHOP', 'THAT', 'IS', 'BEST'] for test stoplist


In [None]:
#[len(list(group)) for key, group in groupby(a)]
def topTenWords(listOfTweets):
    """Relies on presence of global variable stopwords"""
    tok =  []
    for tweet in listOfTweets:
        stripped = stripStopWords(tokens(tweet))
        tok += stripped
    tokDict = Counter(tok)
    tokCounts = list(reversed(sorted(tokDict.items(), key=operator.itemgetter(1)))) #list of tuples (token, # of occurrences)
    topTen = tokCounts[:10]
    return topTen





print(topTenWords(["hello hi", "hello goodbye plastic hello", "hello my cat's breath smells like cat food", "hello plastic I ate too much plastic candy", "Hello Joe!", "I pandas"]))
    

In [None]:
!mkdir topten

In [None]:
def write_topten(dateKey):
    filekey=re.sub('/','', dateKey)
    filename='topten/toptenfor' + filekey + '.csv'
    if not os.path.isfile(filename):
        with open(filename, 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, dialect = 'excel')
            todaysTweets = allTweetsByDate [dateKey]
            todayTopTen = topTenWords(todaysTweets)
            for row in todayTopTen:
                writer.writerow(row)
            csvfile.close()
for key in allTweetsByDate.keys():
    write_topten(key)
    

In [None]:
groupby(["hello", "goodbye", "hi", "hello"])