# Information Retrieval and Web Analytics



# GROUP NAMES:
- Judith Camacho 218863 
- Jordi Marín    207552 
- Xavier Vives   218900


# **PART 1**

In [1]:
# Mount Google Drive (Colab only) so that files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


First we add all the imports:


In [2]:
import nltk
# Fetch the NLTK stop-word corpus; stopwords.words("english") needs it during pre-processing.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
from collections import defaultdict
from array import array
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import math
import re
import json
import pandas as pd
from datetime import datetime

In [71]:
!pip install demoji
import demoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# 1. Reading and loading the dataset

In [4]:
# Locations of the raw tweets and of the tweet-id -> document-id mapping.
data_path = "/content/drive/Shareddrives/IRWA/Project/Part 1/data/"
docs_path = data_path+'tw_hurricane_data.json'
doc_id_mapper_path = data_path+'tweet_document_ids_map.csv'

# Tab-separated file without a header row. Column 1 is used as the index so rows
# can be looked up by it (get_doc looks rows up by tweet id).
# The path gets its own name so that doc_id_mapper only ever holds the DataFrame.
doc_id_mapper = pd.read_csv(doc_id_mapper_path, sep='\t', header=None, index_col=[1])

# Read with an explicit encoding instead of the platform default, so the
# non-ASCII characters in tweets decode the same way on every machine.
with open(docs_path, encoding='utf-8') as fp:
    lines = fp.readlines()

# One JSON document per line; we convert string to json right at this stage.
# NOTE(review): replace(' +', ' ') is what turns the "+0000" offset of created_at
# into the bare "0000" that get_date expects, but it is applied to the whole line,
# so any " +" inside the tweet text is altered as well.
tw_json = [json.loads(line.strip().replace(' +', ' ')) for line in lines]

Now we want to check how many documents there are:


In [5]:
# Report how many tweets were loaded.
print(f"Total number of tweets in the dataset: {len(tw_json)}")

Total number of tweets in the dataset: 4000


# 2. Processing the text

First we identify which fields we are going to need. We know that the final output must return (when present) the following information for each of the selected documents:
*Tweet | Username | Date | Hashtags | Likes | Retweets | Url (here the “Url” means the tweet link).*

From the data and the official API website (https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet), we know that the names of the fields that we need are:

- "text"
- "user.name" 
- "created_at"
- "entities.hashtags.text" 
- "favorite_count"
- "retweet_count"
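- Url: the tweet link is not stored in a single field (the "urls" entity lists links that appear inside the tweet text), so we build it ourselves from "user.name" and "id"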

Now we want to pre-process each document in the following way:
- Removing stop words 
- Tokenization 
- Removing punctuation marks 
- Stemming


In [6]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _text_resources():
    '''
    input: nothing
    function: create the Porter stemmer and the English stop-word set a single
    time; later calls reuse the cached pair instead of reloading the stop-word
    list for every tweet
    output: a tuple (stemmer, stop_words)
    '''
    return PorterStemmer(), frozenset(stopwords.words("english"))


def build_terms(tweet_text):
    '''
    input: the text of a tweet
    function: pre-process the text: lowercase it, delete every character that is
    neither a word character nor whitespace (punctuation, '#', '@', ...), split on
    whitespace, drop English stop words and stem the remaining terms
    output: a list of strings of the processed text
    '''
    stemmer, stop_words = _text_resources()

    tweet_text = tweet_text.lower()  # transform to lowercase
    # Remove punctuation marks and hashtag symbols using a regex.
    # NOTE(review): the characters are deleted, not replaced by a space, so words
    # separated only by punctuation are fused (e.g. "pm…go" -> "pmgo"). Kept as-is
    # so that existing results do not change.
    tweet_text = re.sub(r'[^\w\s]', '', tweet_text)

    # Tokenize, eliminate the stop words, then stem what is left.
    return [stemmer.stem(term) for term in tweet_text.split() if term not in stop_words]

We use a string as an example to check that our function works:

In [7]:
# Hand-written sample mixing upper case, punctuation, a mention and hashtags.
t = "HEllo,  @pi!ksa, csgo is best game) ever #teamliquid #esl"
lol = build_terms(t)
# Last expression of the cell: display the resulting list of terms.
lol

['hello', 'piksa', 'csgo', 'best', 'game', 'ever', 'teamliquid', 'esl']

# 3. Parse the fields of the JSON

Now we will write functions that parse the JSON of each tweet and extract the fields that we need. First we print one document to see its structure:

In [8]:
# Display one raw tweet (the one at index 10) to inspect the available fields.
tw_json[10]

{'created_at': 'Fri Sep 30 18:38:37 0000 2022',
 'id': 1575918052595552256,
 'id_str': '1575918052595552256',
 'full_text': "Today's edition of The Smoke Eater is a photo essay on #HurricaneIan.\n\nAlso, there are animal bleps.\n\nhttps://t.co/ZwjBziC9jf",
 'truncated': False,
 'display_text_range': [0, 125],
 'entities': {'hashtags': [{'text': 'HurricaneIan', 'indices': [55, 68]}],
  'symbols': [],
  'user_mentions': [],
  'urls': [{'url': 'https://t.co/ZwjBziC9jf',
    'expanded_url': 'https://smokeeater.substack.com/p/the-smoke-eater-for-sept-30-2022',
    'display_url': 'smokeeater.substack.com/p/the-smoke-ea…',
    'indices': [102, 125]}]},
 'metadata': {'iso_language_code': 'en', 'result_type': 'recent'},
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id'

Now we build all the functions to parse the fields that we need:

In [9]:
def get_text(tweet):
    '''
    input: a json document (dict) with a 'full_text' string and, optionally, an
    'entities' dict that maps each entity type to a list of entity dicts

    function: every entity that carries an 'indices' pair [start, end] marks
    where in the full_text the entity is located (end is exclusive). All
    characters covered by such a span are removed from the text. Entities
    without a usable 'indices' pair are skipped, and a tweet without 'entities'
    is treated as having none. As before, every remaining '#' character is
    removed from the text too.

    output: the full_text free of the spans covered by its entities
    (URLs, hashtags, mentions, etc.)
    '''
    text = tweet['full_text']
    masked = set()  # character positions covered by at least one entity

    entities = tweet.get('entities')
    if not isinstance(entities, dict):
        entities = {}

    for items in entities.values():
        if not isinstance(items, (list, tuple)):
            continue
        for item in items:
            if not isinstance(item, dict):
                continue
            span = item.get('indices')
            if not (isinstance(span, (list, tuple)) and len(span) == 2):
                continue
            start, end = span
            if not (isinstance(start, int) and isinstance(end, int)):
                continue
            # A negative or reversed span cannot describe a slice of the text.
            if start < 0 or end < start:
                continue
            # Clamp to the text length so an oversized end cannot bloat the mask.
            masked.update(range(start, min(end, len(text))))

    return ''.join(
        char for position, char in enumerate(text)
        if position not in masked and char != '#'
    )

In [10]:
def get_username(tweet):
    '''
    input: a json document
    function: read the name stored inside the tweet's 'user' object
    output: the value of tweet['user']['name']
    '''
    author = tweet['user']
    return author['name']

In [11]:
def get_hashtags(tweet):
    '''
    input: a json document
    function: collect the 'text' of every entry under entities -> hashtags,
    in the order they are listed
    output: list with the hashtags of the tweet
    '''
    hashtags = []
    for tag in tweet['entities']['hashtags']:
        hashtags.append(tag['text'])
    return hashtags

In [12]:
def get_likes(tweet):
    '''
    input: a json document
    function: read 'favorite_count'; an empty or falsy value counts as 0
    output: number of likes of the tweet
    '''
    return tweet['favorite_count'] or 0

In [13]:
def get_retweets(tweet):
    '''
    input: a json document
    function: read 'retweet_count'; an empty or falsy value counts as 0
    output: number of retweets of the tweet
    '''
    count = tweet['retweet_count']
    return count if count else 0

In [78]:
def get_url(user, id_tw):
    '''
    example: https://twitter.com/IbaiLlanos/status/1462299324797304832
    input: a user name, and tweet id
    function: construct url with id and username: spaces become underscores,
    emojis are removed, and any character that is not safe inside a URL path
    segment ('/', '?', '#', non-ASCII letters, ...) is percent-encoded so it
    cannot break the link. Names made of letters, digits, '_', '.', '-' and '~'
    are left exactly as they are.
    output: url of the tweet
    '''
    # Local import so this cell does not depend on an extra line in the imports cell.
    from urllib.parse import quote

    handle = demoji.replace(user.replace(' ', '_'), '')
    # safe='' also escapes '/', which would otherwise add a path level to the URL.
    return 'https://twitter.com/' + quote(handle, safe='') + '/status/' + str(id_tw)

In [37]:
def get_date(tweet):
    '''
    input: a json document
    function: parse date and return in '%Y-%m-%d %H:%M:%S' format. Two layouts of
    'created_at' are accepted: one with a signed UTC offset
    ("Fri Sep 30 18:38:37 +0000 2022") and the one produced by the loading step
    of this notebook, where the "+" has been stripped
    ("Fri Sep 30 18:38:37 0000 2022"). A parsed offset is not applied to the
    clock time.
    output: date of the tweet
    raises: ValueError if 'created_at' matches neither layout
    '''
    raw_date = tweet['created_at']
    accepted_formats = (
        '%a %b %d %H:%M:%S %z %Y',    # signed offset, e.g. +0000
        '%a %b %d %H:%M:%S 0000 %Y',  # "+" already removed when the file was loaded
    )
    for date_format in accepted_formats:
        try:
            parsed = datetime.strptime(raw_date, date_format)
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d %H:%M:%S')
    raise ValueError("unrecognised created_at value: {!r}".format(raw_date))

In [16]:
def get_id(tweet):
    '''
    input: a json document
    function: read the value stored under the 'id' key
    output: id of the tweet
    '''
    tweet_id = tweet['id']
    return tweet_id

In [28]:
def get_doc(tweet):
    '''
    input: a json document
    function: looks the tweet id up in the index of doc_id_mapper (the table
    loaded from the given CSV) and takes the first value of the first matching row
    output: the document number the input tweet is in
    '''
    tweet_id = get_id(tweet)
    # The list-style key makes .loc return a DataFrame even for a single match.
    matching_rows = doc_id_mapper.loc[[tweet_id]]
    first_row = matching_rows.values[0]
    return first_row[0]

Now for each tweet we put together all the info we have parsed:


In [73]:
def get_info(tweet):
    '''
    input: a json document
    function: runs every field parser on the tweet and gathers the results in a
    dictionary, one key per field
    output: relevant information we need
    '''
    clean_text = get_text(tweet)  # not tokenized/stemmed yet
    username = get_username(tweet)
    return {
        'Tweet': clean_text,
        'Username': username,
        'Date': get_date(tweet),
        'Hashtags': get_hashtags(tweet),
        'Likes': get_likes(tweet),
        'Retweets': get_retweets(tweet),
        'Url': get_url(username, get_id(tweet)),
        'Doc': get_doc(tweet),
        "processed_text": build_terms(clean_text),
    }

Now we will make a dictionary with all the tweets and their corresponding relevant information

In [74]:
# Map each tweet id to its parsed record; if an id repeats, the last record wins.
tweet_df_dict = dict()
for tweet in tw_json:
    tweet_df_dict.update({get_id(tweet): get_info(tweet)})

We convert tweet_df_dict to a pandas DataFrame so that it will be easier for us to use the data in the future

In [75]:
# One row per tweet id (the dictionary keys become the index), one column per field.
tweet_df = pd.DataFrame.from_dict(data=tweet_df_dict, orient="index")

In [76]:
# Preview the first rows to check that every field was parsed.
tweet_df.head()

Unnamed: 0,Tweet,Username,Date,Hashtags,Likes,Retweets,Url,Doc,processed_text
1575918182698979328,So this will keep spinning over us until 7 pm…...,Suz👻,2022-09-30 18:39:08,[HurricaneIan],0,0,https://twitter.com/Suz/status/157591818269897...,doc_1,"[keep, spin, us, 7, pmgo, away, alreadi]"
1575918151862304768,Our hearts go out to all those affected by . W...,Lytx,2022-09-30 18:39:01,[HurricaneIan],0,0,https://twitter.com/Lytx/status/15759181518623...,doc_2,"[heart, go, affect, wish, everyon, road, curre..."
1575918140839673873,Kissimmee neighborhood off of Michigan Ave. \n,Christopher Heath,2022-09-30 18:38:58,[HurricaneIan],0,0,https://twitter.com/Christopher_Heath/status/1...,doc_3,"[kissimme, neighborhood, michigan, ave]"
1575918135009738752,I have this one tree in my backyard that scare...,alex ✨,2022-09-30 18:38:57,"[scwx, HurricaneIan]",0,0,https://twitter.com/alex_/status/1575918135009...,doc_4,"[one, tree, backyard, scare, poltergeist, tree..."
1575918119251419136,"I pray for everyone affected by , but...",Tess 💋,2022-09-30 18:38:53,[HurricaneIan],0,0,https://twitter.com/Tess_/status/1575918119251...,doc_5,"[pray, everyon, affect, associ, winknew, sympa..."
