Analyze sample tweets and save them in a SQLite database

In [3]:
#import necessary modules
import sqlite3 
import numpy as np 
import pandas as pd 
import string 
import re 
import nltk
nltk.download("stopwords")
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import sklearn

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/karengrundy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/karengrundy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/karengrundy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/karengrundy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Open the SQLite database file (the database is created if it doesn't exist).
conn = sqlite3.connect('tweets.db') 
# One module-level cursor; the helper functions and cells below all execute through it.
c = conn.cursor()

# NOTE(review): listOfTables is not read anywhere else in the visible notebook — confirm it is still needed.
listOfTables = ['jimmyfallon', 'StephenAtHome']

In [10]:
def table_exists(table_name):
    """Return True when a table named ``table_name`` exists in the database.

    The lookup runs against ``sqlite_master`` through the module-level cursor ``c``.
    The name is bound as a query parameter rather than formatted into the SQL text,
    so names containing quotes are looked up literally instead of breaking the query.
    """
    c.execute(
        "SELECT count(name) FROM sqlite_master WHERE type = 'table' AND name = ?",
        (table_name,),
    )
    return c.fetchone()[0] == 1

def create_tables(table_name):
    """Create the tweet table ``table_name`` unless a table of that name already exists.

    The table has the columns created_at, text, url, replies, retweets, favorites,
    user and whichFile. The statement is executed on the module-level cursor ``c``.
    """
    # Guard clause: nothing to do when the table is already there.
    if table_exists(table_name):
        return
    createStatement = ''' 
            CREATE TABLE {}( 
                created_at BLOB, 
                text TEXT, 
                url TEXT, 
                replies INT, 
                retweets INT,
                favorites INT,
                user TEXT,
                whichFile TEXT 
            ) 
        '''.format(table_name)
    c.execute(createStatement)

def get_tweets(table_name, top_n):
    """Fetch rows from ``table_name``.

    Parameters
    ----------
    table_name : name of the table to read.
    top_n : maximum number of rows to return; zero or a negative number returns every row.

    Returns
    -------
    A list of row tuples, or the string ``"Table <name> does not exist."`` when the
    table is missing (kept as a string so existing callers see the same value).
    """
    if not table_exists(table_name):
        return "Table {} does not exist.".format(table_name)

    if top_n > 0:
        # The row limit is a value, so it is bound as a parameter instead of being
        # formatted into the SQL text; int() keeps non-builtin integer types bindable.
        c.execute("SELECT * FROM {} LIMIT ?".format(table_name), (int(top_n),))
    else:
        c.execute("SELECT * FROM {}".format(table_name))

    # fetchall() already returns a list of rows; no copy loop is needed.
    return c.fetchall()

#insert a single tweet row into the table named by table_name
def insert_tweet(table_name, created_at, text, url, replies, retweets, favorites, user, whichFile):
    """Insert one tweet into ``table_name`` and commit immediately.

    The eight column values are bound as query parameters; the statement runs on the
    module-level cursor ``c`` and the commit goes through the module-level ``conn``.
    """
    insertStatement = ''' INSERT INTO {} (created_at, text, url, replies, retweets, favorites, user, whichFile) VALUES(?, ?, ?, ?, ?, ?, ?, ?) '''.format(table_name)
    rowValues = (created_at, text, url, replies, retweets, favorites, user, whichFile)
    c.execute(insertStatement, rowValues)
    conn.commit()

def delete_tweets(table_name):
    """Delete every row from ``table_name`` and commit the deletion.

    Returns None on success, or the string ``"Table <name> does not exist."`` when
    the table is missing.
    """
    if table_exists(table_name):
        c.execute('''DELETE FROM {}'''.format(table_name))
        # Commit like insert_tweet does; otherwise the DELETE stays in an open
        # transaction and is lost if the connection is closed without a commit.
        conn.commit()
    else:
        return "Table {} does not exist.".format(table_name)

def delete_table(table_name):
    """Drop ``table_name`` from the database; a table that does not exist is ignored."""
    dropStatement = '''DROP TABLE IF EXISTS {}'''.format(table_name)
    c.execute(dropStatement)



In [11]:
# Load the two CSV files into DataFrames and tag every row with the file it came from.
# NOTE(review): both files are decoded as ISO-8859-1. If they were actually saved as
# Windows-1252 or UTF-8, curly quotes/apostrophes will come through garbled — confirm the encoding.
dfJimmyFallonTweets = pd.read_csv("jimmyfallon.csv", encoding="ISO-8859-1")
dfStephenAtHomeTweets = pd.read_csv("StephenAtHome.csv", encoding="ISO-8859-1")

# whichFile records the source file name on every row of each frame.
dfJimmyFallonTweets['whichFile'] = 'jimmyfallon.csv'
dfStephenAtHomeTweets['whichFile'] = 'StephenAtHome.csv'

In [12]:
#drop the tables left over from any previous run so the notebook starts from a clean database
delete_table('jimmyfallon')
delete_table('StephenAtHome')
delete_table('tweets')
# sentimentScore is filled with plain INSERTs further down and the database file
# persists between runs, so without dropping it here every re-run of the notebook
# would append another copy of every scored tweet.
delete_table('sentimentScore')

In [13]:
#create the combined 'tweets' table and copy every row of both DataFrames into it
tweetColumns = ('created_at', 'text', 'url', 'replies', 'retweets', 'favorites', 'user', 'whichFile')

create_tables('tweets')
# Jimmy Fallon's rows go in first, then Stephen Colbert's, one insert per row.
for sourceFrame in (dfJimmyFallonTweets, dfStephenAtHomeTweets):
    for index, row in sourceFrame.iterrows():
        insert_tweet('tweets', *(row[column] for column in tweetColumns))


In [18]:
#read every row of the 'tweets' table back (top_n of 0 means no limit) into one DataFrame
dfWithTweets = pd.DataFrame(
    get_tweets('tweets', 0),
    columns=[
        'created_at',
        'text',
        'url',
        'replies',
        'retweets',
        'favorites',
        'user',
        'whichFile',
    ],
)

In [16]:
#create a function to clean the data in tweet text
def returnCleanedData(data):
    """Strip HTML-style tags from ``data`` and return the remaining text.

    ``data`` is converted with ``str()`` first, so non-string values are accepted.
    Everything from a ``<`` up to the nearest following ``>`` on the same line is removed.
    """
    tagPattern = "<.*?>"
    return re.sub(tagPattern, "", str(data))

In [22]:
# Run returnCleanedData over every value of the "text" column and store the result
# back in the same column (the uncleaned text is overwritten).
dfWithTweets["text"] = dfWithTweets["text"].apply(returnCleanedData)

In [24]:
#Read the positive and negative word lists (one word per line)
def _readWordList(fileName):
    """Return the words in ``fileName`` as a list, one entry per non-blank line."""
    with open(fileName, "r") as wordFile:
        # Blank lines are dropped: a file ending in a newline would otherwise contribute
        # an empty-string "word", which matches the empty token produced by an empty tweet.
        # Surrounding whitespace is stripped because tweet words are compared without any.
        strippedLines = (eachLine.strip() for eachLine in wordFile.read().splitlines())
        return [eachWord for eachWord in strippedLines if eachWord]

listOfPositiveWords = _readWordList("positive.txt")
listOfNegativeWords = _readWordList("negative.txt")

In [26]:
# Score every tweet: +1 for each positive word, -1 for each negative word.

# Sets give constant-time membership tests; scanning the plain word lists for every
# word of every tweet is needlessly slow.
positiveWordSet = set(listOfPositiveWords)
negativeWordSet = set(listOfNegativeWords)

# Translation table that deletes every character in string.punctuation.
punctuationRemover = str.maketrans("", "", string.punctuation)


def calculateSentimentScore(tweetText):
    """Return (number of positive words) - (number of negative words) for one tweet.

    The text is lowercased, stripped of punctuation and split on whitespace before
    the words are looked up. A word found in the positive set counts as positive
    only, even if it also appears in the negative set.
    """
    # split() with no argument splits on any run of whitespace and yields no empty
    # tokens, so no separate whitespace-collapsing step is needed.
    listOfWords = tweetText.lower().translate(punctuationRemover).split()

    positiveCounter = 0
    negativeCounter = 0
    for eachWord in listOfWords:
        if eachWord in positiveWordSet:
            positiveCounter += 1
        elif eachWord in negativeWordSet:
            negativeCounter += 1

    return positiveCounter - negativeCounter


# one score per tweet, in the same order as the rows of dfWithTweets
listWithOverallScore = [calculateSentimentScore(eachTweet) for eachTweet in dfWithTweets["text"]]

dfWithTweets['score'] = listWithOverallScore
    

In [28]:
#Create the sentimentScore table (the tweet columns plus a score column) unless it already exists
scoreTableName = 'sentimentScore'
if not table_exists(scoreTableName):
    createScoreTable = ''' 
        CREATE TABLE {}( 
            created_at BLOB, 
            text TEXT, 
            url TEXT, 
            replies INT, 
            retweets INT,
            favorites INT,
            user TEXT,
            whichFile TEXT,
            score INT 
        ) 
    '''.format(scoreTableName)
    c.execute(createScoreTable)

In [34]:
# Named insert_scored_tweet rather than insert_tweet: reusing insert_tweet here would
# replace the earlier helper of that name, which takes the table name as its first
# argument, and break the cell that fills the 'tweets' table when it is run again.
def insert_scored_tweet(created_at, text, url, replies, retweets, favorites, user, whichFile, score):
    """Insert one scored tweet into the sentimentScore table.

    The nine column values are bound as query parameters on the module-level cursor
    ``c``. The row is not committed here; the caller commits once for the whole batch.
    """
    c.execute(''' INSERT INTO sentimentScore (created_at, text, url, replies, retweets, favorites, user, whichFile, score) VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?) ''', (created_at, text, url, replies, retweets, favorites, user, whichFile, score))

for index, row in dfWithTweets.iterrows():
    insert_scored_tweet(row['created_at'], row['text'], row['url'], row['replies'], row['retweets'], row['favorites'], row['user'], row['whichFile'], row['score'])

# One commit for the whole batch instead of one per row.
conn.commit()

In [36]:
# Preview the first five rows stored in the sentimentScore table.
previewQuery = "SELECT * FROM {} LIMIT {}".format('sentimentScore', 5)
c.execute(previewQuery)
data = c.fetchall()

# Columns are shown by position (0-8) because no column names are passed.
pd.DataFrame(data).head()



Unnamed: 0,0,1,2,3,4,5,6,7,8
0,10/30/2019 1:43,Hit the woah. See you at the movies. This next...,"<a href=""http://twitter.com/download/iphone"" r...",1360,85,1687,jimmyfallon,jimmyfallon.csv,1
1,10/28/2019 22:42,@GameXplain @Nintendo Not bad.,"<a href=""http://twitter.com/download/iphone"" r...",360,36,881,jimmyfallon,jimmyfallon.csv,-1
2,10/27/2019 22:51,Playing Ring Fit Adventure on @Nintendo switch...,"<a href=""http://twitter.com/download/iphone"" r...",12501,1389,15594,jimmyfallon,jimmyfallon.csv,2
3,10/27/2019 22:37,Speaking of (his movie) #PlayingWithFire! @Joh...,"<a href=""http://twitter.com/download/iphone"" r...",1080,90,1174,jimmyfallon,jimmyfallon.csv,1
4,10/27/2019 14:05,Weve got a brand new #FallonTonight show TONI...,"<a href=""http://twitter.com/download/iphone"" r...",1672,88,922,jimmyfallon,jimmyfallon.csv,-1


In [37]:
#close the cursor and then the connection to the database
try:
    # Closing the cursor is inside the try block as well: when the connection has
    # already been closed (for example, this cell is run a second time) it is this
    # call that raises, not conn.close().
    c.close()
    conn.close()
    print("connection closed")
except sqlite3.ProgrammingError:
    # Only the "closed database" error is handled; any other failure still surfaces.
    print("connection already closed")

connection closed
