<center>
<img src="https://laelgelcpublic.s3.sa-east-1.amazonaws.com/lael_50_years_narrow_white.png.no_years.400px_96dpi.png" width="300" alt="LAEL 50 years logo">
<h3>APPLIED LINGUISTICS GRADUATE PROGRAMME (LAEL)</h3>
</center>
<hr>

# Corpus Linguistics - Study 1 - Mariana - Dataset preparation

## Tweet Object documentation

Please refer to the [Tweet Object](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet) documentation.

## Importing the required libraries

In [1]:
import pandas as pd

## Data wrangling

### Importing the tweet raw data into a dataframe

In [2]:
# The input is read as JSON Lines (one JSON document per line)
raw_tweets_path = 'mari201901.jsonl'
df_tweets_raw_data = pd.read_json(raw_tweets_path, lines=True)

In [3]:
# Preview the first five rows of the raw dataframe
df_tweets_raw_data.head(n=5)

Unnamed: 0,created_at,entities,favorite_count,favorited,filter_level,id,id_str,is_quote_status,lang,quote_count,...,possibly_sensitive,in_reply_to_screen_name,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_status_id,in_reply_to_status_id_str,coordinates,geo,place,withheld_in_countries
0,2019-01-15 14:07:11+00:00,"{'hashtags': [{'indices': [67, 77], 'text': 'V...",0,False,low,1085176574671048705,1085176574671048704,False,es,0,...,,,,,,,,,,
1,2019-01-15 14:07:11+00:00,"{'hashtags': [{'indices': [67, 77], 'text': 'V...",0,False,low,1085176574671048705,1085176574671048704,False,es,0,...,,,,,,,,,,
2,2019-01-13 15:05:36+00:00,"{'hashtags': [{'indices': [38, 48], 'text': 'V...",0,False,low,1084466499962707968,1084466499962707968,True,en,0,...,,,,,,,,,,
3,2019-01-13 15:05:36+00:00,"{'hashtags': [{'indices': [38, 48], 'text': 'V...",0,False,low,1084466499962707968,1084466499962707968,True,en,0,...,,,,,,,,,,
4,2019-01-13 13:36:24+00:00,"{'hashtags': [{'indices': [28, 38], 'text': 'V...",0,False,low,1084444052030976002,1084444052030976000,True,es,0,...,,,,,,,,,,


In [4]:
# Dimensions of the raw dataframe as (number of rows, number of columns)
df_tweets_raw_data.shape

(38831, 36)

### Checking if data types are consistent

In [5]:
# Data type of each column of the raw dataframe
df_tweets_raw_data.dtypes

created_at                   datetime64[ns, UTC]
entities                                  object
favorite_count                             int64
favorited                                   bool
filter_level                              object
id                                         int64
id_str                                     int64
is_quote_status                             bool
lang                                      object
quote_count                                int64
reply_count                                int64
retweet_count                              int64
retweeted                                   bool
retweeted_status                          object
source                                    object
text                                      object
timestamp_ms                      datetime64[ns]
truncated                                   bool
user                                      object
quoted_status                             object
quoted_status_id    

#### Converting `id` column's data type to `str` for future use

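Note: pandas has parsed the digit-only `id_str` strings as numbers (the column's dtype is `int64`), and some values lost precision in the process — for example, `1085176574671048705` was imported as `1085176574671048704`. Therefore, `id` is being used instead.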

In [6]:
# Cast only the 'id' column to text; every other column keeps its dtype
df_tweets_raw_data = df_tweets_raw_data.astype({'id': str})

### Dropping unnecessary columns

#### Listing the columns

In [7]:
# Column names of the dataframe as a plain Python list
df_tweets_raw_data.columns.tolist()

['created_at',
 'entities',
 'favorite_count',
 'favorited',
 'filter_level',
 'id',
 'id_str',
 'is_quote_status',
 'lang',
 'quote_count',
 'reply_count',
 'retweet_count',
 'retweeted',
 'retweeted_status',
 'source',
 'text',
 'timestamp_ms',
 'truncated',
 'user',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'quoted_status_permalink',
 'extended_tweet',
 'display_text_range',
 'extended_entities',
 'possibly_sensitive',
 'in_reply_to_screen_name',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'coordinates',
 'geo',
 'place',
 'withheld_in_countries']

#### Selecting the columns that are being dropped

In [8]:
# Columns that are discarded. The columns deliberately left out of this list
# (and therefore kept) are: 'created_at', 'id', 'lang', 'text' and 'user'.
columns_to_drop = [
    'entities',
    'favorite_count',
    'favorited',
    'filter_level',
    'id_str',
    'is_quote_status',
    'quote_count',
    'reply_count',
    'retweet_count',
    'retweeted',
    'retweeted_status',
    'source',
    'timestamp_ms',
    'truncated',
    'quoted_status',
    'quoted_status_id',
    'quoted_status_id_str',
    'quoted_status_permalink',
    'extended_tweet',
    'display_text_range',
    'extended_entities',
    'possibly_sensitive',
    'in_reply_to_screen_name',
    'in_reply_to_user_id',
    'in_reply_to_user_id_str',
    'in_reply_to_status_id',
    'in_reply_to_status_id_str',
    'coordinates',
    'geo',
    'place',
    'withheld_in_countries',
]

df_tweets_raw_data = df_tweets_raw_data.drop(columns=columns_to_drop)

In [9]:
# Column names remaining after the drop, as a plain Python list
df_tweets_raw_data.columns.tolist()

['created_at', 'id', 'lang', 'text', 'user']

### Listing the values of the parameter `lang`

In [10]:
# Distinct language codes present in the 'lang' column
df_tweets_raw_data.loc[:, 'lang'].unique()

array(['es', 'en', 'und', 'pt', 'ca', 'fr', 'eu', 'it', 'de', 'ar', 'ht',
       'zh', 'fa', 'tr', 'sv', 'cy', 'ur', 'ro', 'in', 'uk', 'el', 'hi',
       'nl', 'pl', 'ru', 'cs', 'tl', 'fi', 'no', 'lt', 'ja', 'et', 'sr',
       'hu', 'da'], dtype=object)

### Keeping only the tweets in Portuguese

In [11]:
# Boolean mask selecting the rows whose language code is 'pt'
is_portuguese = df_tweets_raw_data['lang'] == 'pt'

# Keep only those rows and renumber the index from 0
df_tweets_raw_data = df_tweets_raw_data.loc[is_portuguese].reset_index(drop=True)

### Extracting the column `username`

In [12]:
# Expand the nested 'user' objects into a dataframe with one column per attribute
df_tweets_raw_data_flattened_user = pd.json_normalize(df_tweets_raw_data['user'])

# Take the 'screen_name' attribute and store it as the new column 'username'
username = df_tweets_raw_data_flattened_user.loc[:, 'screen_name']
df_tweets_raw_data['username'] = username

### Extracting the column `author_id`

In [13]:
# Extract the 'id_str' attribute from the flattened 'user' data
author_id = df_tweets_raw_data_flattened_user['id_str']

# Create a new column 'author_id'
df_tweets_raw_data['author_id'] = author_id

### Extracting the column `tweet_url`

In [14]:
# Construct the tweet URL using the tweet ID and user's screen name
df_tweets_raw_data['tweet_url'] = (
    'https://twitter.com/' + 
    df_tweets_raw_data['username'] + 
    '/status/' + 
    df_tweets_raw_data['id']
)

In [15]:
# Display the dataframe, now including the 'username', 'author_id' and 'tweet_url' columns
df_tweets_raw_data

Unnamed: 0,created_at,id,lang,text,user,username,author_id,tweet_url
0,2019-01-13 16:02:20+00:00,1084480777365139458,pt,RT @RenovaMidia: #URGENTE\n\nServiço de Inteli...,"{'contributors_enabled': False, 'created_at': ...",TaconThiago,1057374436624609281,https://twitter.com/TaconThiago/status/1084480...
1,2019-01-13 16:02:20+00:00,1084480777365139458,pt,RT @RenovaMidia: #URGENTE\n\nServiço de Inteli...,"{'contributors_enabled': False, 'created_at': ...",TaconThiago,1057374436624609281,https://twitter.com/TaconThiago/status/1084480...
2,2019-01-13 16:00:40+00:00,1084480357959901186,pt,RT @RenovaMidia: #URGENTE\n\nServiço de Inteli...,"{'contributors_enabled': False, 'created_at': ...",Brunobr18373270,1076139993599541248,https://twitter.com/Brunobr18373270/status/108...
3,2019-01-13 16:00:40+00:00,1084480357959901186,pt,RT @RenovaMidia: #URGENTE\n\nServiço de Inteli...,"{'contributors_enabled': False, 'created_at': ...",Brunobr18373270,1076139993599541248,https://twitter.com/Brunobr18373270/status/108...
4,2019-01-13 15:57:54+00:00,1084479661709631494,pt,RT @RenovaMidia: #URGENTE\n\nServiço de Inteli...,"{'contributors_enabled': False, 'created_at': ...",Pedrodon17,1052964117181583361,https://twitter.com/Pedrodon17/status/10844796...
...,...,...,...,...,...,...,...,...
677,2019-01-24 07:38:52+00:00,1088340342347313152,pt,RT @EnioMainardi: Gleisi disse que #trump se m...,"{'contributors_enabled': False, 'created_at': ...",Tioito1,982506039055716352,https://twitter.com/Tioito1/status/10883403423...
678,2019-01-24 07:49:09+00:00,1088342930245464065,pt,RT @PolitzOficial: #CONFIRMADO!\n\nGENERAL DE ...,"{'contributors_enabled': False, 'created_at': ...",jonh__fox,896205547,https://twitter.com/jonh__fox/status/108834293...
679,2019-01-22 08:37:44+00:00,1087630380897832960,pt,RT @GaliciaAberta: Miranda reúnese cos preside...,"{'contributors_enabled': False, 'created_at': ...",ppdegalicia,13494262,https://twitter.com/ppdegalicia/status/1087630...
680,2019-01-24 08:29:48+00:00,1088353160173928448,pt,RT @rogergzz: Amor para #Venezuela 💛💙❤️,"{'contributors_enabled': False, 'created_at': ...",mavitrejo,980262593515356160,https://twitter.com/mavitrejo/status/108835316...


### Inspecting the data

In [16]:
# Index label of the row to inspect
inspected_row = 450

# Print each inspected field as '<field>:<value>'
for field in ('username', 'text', 'tweet_url'):
    print(field + ':' + df_tweets_raw_data.loc[inspected_row, field])

username:sillaemimbre
text:RT @RenovaMidia: O grupo Rumbo Libertad espera que o presidente interino da #Venezuela, Juan Guaidó, "ordene a captura de Nicolás Maduro e…
tweet_url:https://twitter.com/sillaemimbre/status/1088577916118728704


### Creating the file `mari20192020.tsv`

In [17]:
# Columns written to the output file, in this order
export_columns = ['created_at', 'author_id', 'username', 'tweet_url', 'text']

# Write a tab-separated, UTF-8 encoded file without the index column
df_tweets_raw_data[export_columns].to_csv(
    'mari20192020.tsv',
    sep='\t',
    index=False,
    encoding='utf-8',
    lineterminator='\n',
)