# Tweets Collection

Up to 300 tweets were collected for each movie. This was done to gain further insight into the public's opinions of the movies. As Twitter is a popular microblogging platform, it was the perfect place to gather opinions. We used GetOldTweets3, a Python library that fetches tweets from the results of Twitter's search engine. 

In [None]:
#Import Time
import time
from time import sleep
from datetime import datetime
from datetime import date
from dateutil.relativedelta import relativedelta
#Importing Tweepy
import tweepy
from tweepy import OAuthHandler
#Import Pandas
import pandas as pd
#Import CSV
import csv
#Import module to retrieve tweets
import GetOldTweets3 as got
import sys
# Take the first command-line argument as the user name.
# NOTE(review): raises IndexError when no argument is supplied, and `username`
# is not referenced anywhere else in the visible code - confirm it is needed.
username = sys.argv[1]

In [None]:
#For Preprocessing
#Import re for the regular expressions used by the cleaning steps
import re
#Import string for list of punctuation
import string

import emoji
import emot
#Import natural language toolkit
import nltk
#import numpy
import numpy as np
#Importing pandas
import pandas as pd
import spacy
#import list of stop words
from nltk.corpus import stopwords
#import lemmatizer
from nltk.stem import WordNetLemmatizer
#import tokenizer
from nltk.tokenize import RegexpTokenizer

pd.options.mode.chained_assignment = None

## Pre-processing
We performed EDA on the dataset and here are the following observations: 

1) Since Twitter was only founded in mid-2006 and we wanted to retrieve tweets from the 6 months before and after each release, we had to drop movies that were released before 2007.

2) Replaced the '-' in the release date with '/', as the API reads dates in the format 'YYYY/MM/DD'.

In [None]:
#Clean data
# Load the TMDB movie table; later cells read its release_year, release_date
# and original_title columns.
df = pd.read_csv("tmdb_movies_data.csv")

In [None]:
# Removing movies before 2007 and changing datetime format.
# Twitter only started in mid-2006, so earlier releases cannot have tweets in
# the 6 months around their release date.
# .copy() gives the filtered rows their own frame, so the column assignments
# below unambiguously modify `df` rather than a slice of the unfiltered table.
df = df[df["release_year"] >= 2007].copy()
# Normalise whatever date text the CSV holds into 'YYYY-MM-DD', then switch the
# separator to '/' because the later cells parse the value with '%Y/%m/%d'.
df['release_date'] = pd.to_datetime(df['release_date'], utc = False)
df['release_date'] = df['release_date'].astype(str).str.replace("-", "/")

In [None]:
# Retrieving dates 6 months before and after movie
# Walk every movie, echo its release date and derive the search window that
# starts six months before and ends six months after the release.
for row, line in df.iterrows():
    movie_name = line['original_title']
    rows = [movie_name]
    released = str(line['release_date'])
    print(released)
    datetime_object = datetime.strptime(released, '%Y/%m/%d')
    date_from = (datetime_object + relativedelta(months=-6)).date()
    date_until = (datetime_object + relativedelta(months=+6)).date()

### Twitter Scraping Code

We had to use several nested for loops and put the scraper to sleep whenever an HTTP error (too many requests) occurred.

In [None]:
# twitter scraping code
tweets_col = []
searched_tweets = []
last_id = -1


def getTweets(max_attempts=100, sleep_seconds=300):
    """Scrape up to 300 English tweets per movie in ``df`` into CSV files.

    For each row of the module-level ``df`` the movie title is searched on
    Twitter within a window of six months before to six months after the
    release date, and one CSV row is written per tweet found.

    Output goes to ``twitterdata_<n>.csv``.  Whenever a fetch fails, the
    current file is closed, the function sleeps, and scraping resumes at the
    movie that failed, writing to a new file with the next ``<n>``.

    Args:
        max_attempts: Maximum number of scraping passes (initial pass plus
            retries) before giving up.
        sleep_seconds: Seconds to wait after a failed pass before retrying.
    """
    count = 0  # movies completed so far; also the next row index to fetch
    end = len(df)
    loop_count = 1  # numeric suffix of the CSV file currently being written
    for _ in range(max_attempts):
        try:
            print("starting loop: ",loop_count)
            # Explicit UTF-8 so tweets containing emoji or other non-ASCII
            # text can be written regardless of the platform default encoding.
            with open('twitterdata_{}.csv'.format(loop_count), 'w',
                      newline="", encoding='utf-8') as f:
                thewriter = csv.writer(f)
                thewriter.writerow(['Movie_Title', 'Date', "Tweet_Date","Tweet_Content", "Num_Retweets", "Hashtags"])
                for row in range(count, end):
                    movie = df.iloc[row]
                    movie_name = movie['original_title']
                    released = str(movie['release_date'])
                    datetime_object = datetime.strptime(released, '%Y/%m/%d')
                    date_from = str((datetime_object + relativedelta(months=-6)).date())
                    date_until = str((datetime_object + relativedelta(months=+6)).date())
                    tweetCriteria = (
                        got.manager.TweetCriteria()
                        .setQuerySearch(movie_name.lower())
                        .setSince(date_from)
                        .setUntil(date_until)
                        .setMaxTweets(300)
                        .setLang('en')
                    )
                    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
                    print("movies done: ",count,)
                    # Only count the movie once its tweets were fetched, so a
                    # failed fetch is retried from the same movie.
                    count += 1
                    for tweet in tweets:
                        thewriter.writerow([
                            movie_name,
                            released,
                            tweet.date,
                            tweet.text,
                            tweet.retweets,
                            tweet.hashtags,
                        ])
        except (Exception, SystemExit):
            # KeyboardInterrupt is deliberately NOT caught so the scraper can
            # be stopped by hand.
            # NOTE(review): SystemExit is kept in the list because the
            # original bare `except:` caught it and GetOldTweets3 appears to
            # call sys.exit() on failed HTTP requests - confirm.
            print("HHTPerror: sleeping for {}secs".format(sleep_seconds))
            loop_count += 1
            time.sleep(sleep_seconds)
        else:
            # The inner loop ran to completion: every movie has been written.
            break

In [None]:
# Run the scraper; it writes its results to twitterdata_<n>.csv files.
getTweets()

In [None]:
# NOTE(review): at this point `df` still holds the filtered movie table loaded
# from tmdb_movies_data.csv, while the scraped tweets were written to
# twitterdata_<n>.csv by getTweets(). The cleaning cells read
# tweet_content.csv and expect a Tweet_Content column - confirm how the
# twitterdata files are merged into this file.
df.to_csv('tweet_content.csv', index = False)

# Cleaning the Results

After exporting the results into a new csv, it was time to clean the csv in order to prepare for Sentiment Analysis. Here is what we did:

1) Drop duplicate rows with same tweet content 

2) Convert tweets to lowercase

3) Remove HTTP links from tweets

4) Remove punctuation from tweets

5) Remove digits from tweets

6) Remove individual movie names from tweets

7) Remove stop words and the 10 most frequent words from tweets
 
8) Convert emoticons to words from tweets

9) Remove emojis from tweets

10) Chatword conversion from tweets

11) Lemmatize tweets

12) Drop duplicate rows with same cleaned tweets

In [None]:
# Reload the exported tweets from disk, using pandas' Python parsing engine.
df = pd.read_csv('tweet_content.csv', engine = "python")

In [None]:
# Print a summary of the loaded frame as a quick sanity check.
df.info()

### 1) Dropping Duplicates

In [None]:
# Keep only the first occurrence of each distinct raw tweet text.
df = df.drop_duplicates(subset='Tweet_Content', keep="first")

In [None]:
# Force every tweet to str so the string operations in the following cells
# work on all rows (a missing value becomes the text "nan").
df["Tweet_Content"] = df["Tweet_Content"].astype(str)

### 2) Converting Tweets to Lowercase

In [None]:
# Work on a lower-cased copy in a new 'cleaned' column; the raw text stays in
# 'Tweet_Content'.
df["cleaned"] = df["Tweet_Content"].str.lower()

### 3) Remove HTTP Links

In [None]:
def remove_http(text):
    """Return *text* with every http:// or https:// URL removed.

    A URL is taken to run from the scheme up to (not including) the next
    whitespace character; the whitespace around it is left untouched.
    """
    # Raw string: in a plain literal '\S' is an invalid escape sequence.
    return re.sub(r'https?://\S+', '', text)

In [None]:
# Strip URLs from every lower-cased tweet.
df["cleaned"] = df["cleaned"].apply(remove_http)

### 4) Remove Punctuations

In [None]:
PUNCT_TO_REMOVE = string.punctuation
# Deletion table built once at definition time instead of on every call.
_PUNCT_TABLE = str.maketrans('', '', PUNCT_TO_REMOVE)


def remove_punctuation(text):
    """Return *text* with every character of ``PUNCT_TO_REMOVE`` deleted.

    Characters are removed outright (not replaced by a space), so words
    joined only by punctuation are merged.
    """
    return text.translate(_PUNCT_TABLE)

In [None]:
# Strip punctuation from every tweet, then preview the first rows.
df["cleaned"] = df["cleaned"].apply(remove_punctuation)
df.head()

### 5) Remove Digits

In [None]:
# Delete every digit character from the tweet text; the result is stored in a
# new 'cleaned_2' column.
df["cleaned_2"] = df["cleaned"].apply(
    lambda tweet: "".join(ch for ch in tweet if not ch.isdigit())
)

### 6) Remove Movie Names 

In [None]:
# Remove the words of each movie's own title from its tweets.
# Title words are matched as whole words only: plain substring matching would
# also delete letters inside unrelated words (a title word such as "a" or "i"
# would strip that letter from the entire tweet).
loopcount = 0
title_patterns = {}  # one compiled pattern per title; many tweets share one
stripped_tweets = []
for loopcount, (title, tweet) in enumerate(
        zip(df['Movie_Title'], df['cleaned_2']), start=1):
    print ("movie names removed:", loopcount)
    if title not in title_patterns:
        title_words = title.lower().split()
        if title_words:
            alternation = '|'.join(map(re.escape, title_words))
            # Lookarounds instead of \b so that title words which start or
            # end with punctuation are still delimited correctly.
            title_patterns[title] = re.compile(
                r'(?<!\w)(?:' + alternation + r')(?!\w)')
        else:
            title_patterns[title] = None
    pattern = title_patterns[title]
    stripped_tweets.append(pattern.sub("", tweet) if pattern else tweet)
# Assign the whole column at once: writing through df[col].iloc[i] is chained
# assignment and is not guaranteed to update `df` itself.
df['cleaned_2'] = stripped_tweets

In [None]:
# 'cleaned' has been superseded by 'cleaned_2', so drop it.
df = df.drop(columns=['cleaned'])

### 7) Removing Stopwords and 10 most frequent words

In [None]:
# removing stop words
STOPWORDS = set(stopwords.words('english'))


def remove_stopwords(text):
    """Return *text* without the words listed in ``STOPWORDS``.

    *text* is converted to str and split on whitespace; the surviving words
    are re-joined with single spaces. Matching is exact and case-sensitive.
    """
    return " ".join(word for word in str(text).split() if word not in STOPWORDS)


df["text_wo_stop"] = df["cleaned_2"].apply(remove_stopwords)
df.head()

In [None]:
# counting word frequencies (input for removing the top 10 frequent words)
from collections import Counter

# Tally every whitespace-separated word across all stop-word-free tweets.
cnt = Counter()
for text in df["text_wo_stop"].values:
    cnt.update(text.split())

In [None]:
# removing top 10 frequent words
# The ten words with the highest counts in `cnt`.
FREQWORDS = {word for word, _ in cnt.most_common(10)}


def remove_freqwords(text):
    """Return *text* with every word contained in ``FREQWORDS`` left out.

    *text* is converted to str and split on whitespace; the remaining words
    are joined back together with single spaces.
    """
    kept = [token for token in str(text).split() if token not in FREQWORDS]
    return " ".join(kept)


df["cleaned_2"] = df["text_wo_stop"].apply(remove_freqwords)
df.head()

In [None]:
# Discard the intermediate stop-word column and move the cleaned text into a
# new 'Cleaned_Tweets' column appended at the end of the frame.
df = df.drop(columns=['text_wo_stop'])
df['Cleaned_Tweets'] = df.pop('cleaned_2')

### 8) Converting Emoticons to Words

In [None]:
import emot
EMOTICONS = emot.EMOTICONS


def convert_emoticons(text):
    """Replace each emoticon found in *text* with its lower-cased description.

    Every key of ``EMOTICONS`` is used as a regular-expression fragment; its
    value, with commas removed, whitespace collapsed and underscores turned
    into spaces, is substituted for each match.
    """
    for pattern, meaning in EMOTICONS.items():
        words = meaning.replace(",", "").split()
        replacement = "_".join(words).replace("_", " ").lower()
        text = re.sub(u'(' + pattern + ')', replacement, text)
    return text

In [None]:
# Spell out emoticons in every cleaned tweet.
df["Cleaned_Tweets"] = df["Cleaned_Tweets"].apply(convert_emoticons)

### 9) Removing Emojis

In [None]:
# Compiled once at definition time instead of on every call.
# NOTE(review): the last range runs from U+24C2 to U+1F251, which also covers
# many non-emoji characters (for example CJK ideographs) - confirm that is
# acceptable for this data.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)


def remove_emoji(string):
    """Return *string* with every run of characters in the emoji ranges removed.

    Matches are deleted outright; the surrounding text, including any
    whitespace next to the emoji, is left untouched.
    """
    return _EMOJI_PATTERN.sub(r'', string)

In [None]:
# Strip emoji from every cleaned tweet.
df["Cleaned_Tweets"] = df["Cleaned_Tweets"].apply(remove_emoji)

### 10) Chatword Conversion

In [None]:
# Lookup table of chat abbreviations, one "ABBREVIATION=Expansion" entry per
# line.
# NOTE(review): B4N is listed twice, and the ILU expansion repeats the
# abbreviation itself ("ILU: I Love You") - confirm whether that is intended.
chat_words_str = """
AFAIK=As Far As I Know
AFK=Away From Keyboard
ASAP=As Soon As Possible
ATK=At The Keyboard
ATM=At The Moment
A3=Anytime, Anywhere, Anyplace
BAK=Back At Keyboard
BBL=Be Back Later
BBS=Be Back Soon
BFN=Bye For Now
B4N=Bye For Now
BRB=Be Right Back
BRT=Be Right There
BTW=By The Way
B4=Before
B4N=Bye For Now
CU=See You
CUL8R=See You Later
CYA=See You
FAQ=Frequently Asked Questions
FC=Fingers Crossed
FWIW=For What It's Worth
FYI=For Your Information
GAL=Get A Life
GG=Good Game
GN=Good Night
GMTA=Great Minds Think Alike
GR8=Great!
G9=Genius
IC=I See
ICQ=I Seek you (also a chat program)
ILU=ILU: I Love You
IMHO=In My Honest/Humble Opinion
IMO=In My Opinion
IOW=In Other Words
IRL=In Real Life
KISS=Keep It Simple, Stupid
LDR=Long Distance Relationship
LMAO=Laugh My A.. Off
LOL=Laughing Out Loud
LTNS=Long Time No See
L8R=Later
MTE=My Thoughts Exactly
M8=Mate
NRN=No Reply Necessary
OIC=Oh I See
PITA=Pain In The A..
PRT=Party
PRW=Parents Are Watching
ROFL=Rolling On The Floor Laughing
ROFLOL=Rolling On The Floor Laughing Out Loud
ROTFLMAO=Rolling On The Floor Laughing My A.. Off
SK8=Skate
STATS=Your sex and age
ASL=Age, Sex, Location
THX=Thank You
TTFN=Ta-Ta For Now!
TTYL=Talk To You Later
U=You
U2=You Too
U4E=Yours For Ever
WB=Welcome Back
WTF=What The F...
WTG=Way To Go!
WUF=Where Are You From?
W8=Wait...
7K=Sick:-D Laugher
"""

In [None]:
# chatwords conversion

# Parse the "ABBREVIATION=Expansion" lines into a lookup dict; when an
# abbreviation occurs more than once the last entry wins.
chat_words_map_dict = {}
for entry in chat_words_str.split("\n"):
    if entry != "":
        fields = entry.split("=")
        chat_words_map_dict[fields[0]] = fields[1]

# Set of all known abbreviations, used for the membership test below.
chat_words_list = set(chat_words_map_dict)


def chat_words_conversion(text):
    """Expand chat abbreviations in *text*.

    *text* is split on whitespace. A word whose upper-cased form is a known
    abbreviation is replaced by its lower-cased expansion; every other word is
    kept as it is. The words are re-joined with single spaces.
    """
    converted = []
    for token in text.split():
        key = token.upper()
        if key in chat_words_list:
            converted.append(chat_words_map_dict[key].lower())
        else:
            converted.append(token)
    return " ".join(converted)


chat_words_conversion("one minute BRB")

In [None]:
# Expand chat abbreviations in every cleaned tweet.
df["Cleaned_Tweets"] = df["Cleaned_Tweets"].apply(chat_words_conversion)

### 11) Lemmatize

In [None]:
# Rows without cleaned text cannot be lemmatized, so drop them first.
df = df[df['Cleaned_Tweets'].notna()]

lemmatizer = WordNetLemmatizer()


def lemmatize_words(text):
    """Return *text* with each whitespace-separated word lemmatized.

    Words are passed one by one to the module-level ``lemmatizer`` and the
    results are re-joined with single spaces.
    """
    lemmas = map(lemmatizer.lemmatize, text.split())
    return " ".join(lemmas)


df["Cleaned_Tweets"] = df["Cleaned_Tweets"].apply(lemmatize_words)
df.head()

### 12) Dropping Duplicates from Results

In [None]:
# We realised that the API had collected a few duplicated tweets, so we remove
# them here.
# The 'Unnamed: ...' columns are leftover index columns; errors='ignore' keeps
# this cell from raising KeyError when the loaded CSV does not contain them.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')
# Keep only the first row for each distinct cleaned tweet text.
df = df.drop_duplicates(subset='Cleaned_Tweets', keep="first")

# Sentiment Analysis

We decided to use TextBlob to conduct the sentiment analysis. After computing the sentiment of each tweet, we averaged the polarity values for each movie. 

In [None]:
import textblob
from textblob import TextBlob

In [None]:
# takes long to run
def getSentiment():
    """Score every cleaned tweet in ``df`` with TextBlob.

    Adds three columns to the module-level ``df``:

    * ``polarity``     - TextBlob polarity of the tweet
    * ``subjectivity`` - TextBlob subjectivity of the tweet
    * ``sentiment``    - ``'pos'`` when the polarity is greater than 0,
      otherwise ``'neg'`` (a polarity of exactly 0 is labelled ``'neg'``)

    Progress is printed once per tweet.
    """
    polarities = []
    subjectivities = []
    for loopcount, tweet in enumerate(df['Cleaned_Tweets'], start=1):
        print ("sentiment done:", loopcount)
        # Evaluate the tweet once and read both scores from the same result.
        sentiment = TextBlob(tweet).sentiment
        polarities.append(sentiment.polarity)
        subjectivities.append(sentiment.subjectivity)
    # Whole-column assignment instead of one .loc write per row and column.
    df['polarity'] = polarities
    df['subjectivity'] = subjectivities
    df['sentiment'] = ['pos' if pol > 0 else 'neg' for pol in polarities]

In [None]:
# Compute polarity, subjectivity and the pos/neg label for every tweet in df.
getSentiment()

In [None]:
# CSV with the full tweet content and individual sentiment analysis
# (one row per tweet; the DataFrame index is not written).
df.to_csv('combined_pp.csv', index = False)

In [None]:
# Averaging the polarity results
# Coerce polarity to numbers (unparseable values become NaN), then take the
# mean polarity of the tweets of each movie, keyed by tmdb_id.
df["polarity"] = pd.to_numeric(df["polarity"], errors='coerce')
df_new = df.groupby(df['tmdb_id']).agg({'polarity': 'mean'})
# NOTE(review): the result of this drop is not assigned, so df_new itself
# still contains its first row - confirm whether that row should be removed.
df_new.drop(df_new.index[0])

In [None]:
# CSV with movies sentiment analysis
# (one row per tmdb_id holding the mean polarity; the index is written too).
df_new.to_csv('sentimental.csv')