# Tweets Featuring

Loading the file capturing tweets, create some new features

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
%load_ext watermark
%watermark -v -m -p numpy,pandas -g

import re
from tqdm import tqdm
import yaml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import watermark
import emoji

CPython 3.7.3
IPython 7.5.0

numpy 1.16.4
pandas 0.24.2

compiler   : GCC 7.3.0
system     : Linux
release    : 5.0.0-19-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit
Git hash   : c077a50bb88d22f95b9db6e256624c4701eb7011


### Constants

Modify these values to update the config file, the query or the output file.

In [2]:
INPUT_FILE = "tweets.csv"
OUTPUT_FILE = "tweets-processed.csv"

In [3]:
tweets_df = pd.read_csv(INPUT_FILE)

In [4]:
tweets_df.head(5)

Unnamed: 0,screen_name,location,source,coordinates,favorite_count,favorited,lang,hashtags,created_at,text
0,ccchapman3103,"MN, AZ, TX, USA",Twitter Web Client,,0,False,en,"[{'text': 'Plato', 'indices': [70, 76]}, {'tex...",2019-07-02 07:50:54,"RT @DaviesWriter: At the touch of a lover, eve..."
1,DonRon777,,Twitter Web App,,0,False,en,[],2019-07-02 07:48:36,RT @777Liquid: What do you think about the new...
2,lavenderlens,sightseeing at the cathedral💀,Twitter for iPhone,,0,False,en,"[{'text': 'sfx', 'indices': [89, 93]}, {'text'...",2019-07-02 07:47:27,RT @katesfxmakeup: Time to write a new blog bu...
3,AYoungNegus,Chicago,Twitter for iPhone,,0,False,en,"[{'text': 'blackdynamite', 'indices': [97, 111...",2019-07-02 07:45:22,RT @jaiganticstudio: One of the greatest scene...
4,EmpireDynamic,50 MILLION VIEWS MONTHLY,Twibble.io,,0,False,en,"[{'text': 'boxoffice', 'indices': [93, 103]}]",2019-07-02 07:45:11,Cineflix Acquires Global Rights to Israel-Iran...


In [5]:
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
screen_name       1000 non-null object
location          779 non-null object
source            1000 non-null object
coordinates       16 non-null object
favorite_count    1000 non-null int64
favorited         1000 non-null bool
lang              1000 non-null object
hashtags          1000 non-null object
created_at        1000 non-null object
text              1000 non-null object
dtypes: bool(1), int64(1), object(8)
memory usage: 71.4+ KB


In [6]:
tweets_df.describe()

Unnamed: 0,favorite_count
count,1000.0
mean,0.792
std,4.123559
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,78.0


From previous information, location has a certain number of NaN which we can replace for the string 'unknown' and coordinates has very little information as it is mostly null values

In [7]:
tweets_df['location'].fillna('unknown', inplace=True)

In [8]:
print('Kwnown coordinates: {}%'.format(100 * tweets_df.coordinates.count() / tweets_df.shape[0]))

Kwnown coordinates: 1.6%


In [9]:
tweets_df.drop(['coordinates'], axis=1, inplace=True)

## Create extra features

* Extract tags embeded in the tweet:

In [10]:
tweets_df.text[0]

'RT @DaviesWriter: At the touch of a lover, everyone becomes a poet.\n― #Plato \n\n#poetry #amwriting #film #screenwriting #Hollywood #acting…'

In [11]:
tag_regex = re.compile(r'#[\w]+')
def get_tags(text, regex=tag_regex):
    tags = regex.findall(text)
    return [k.replace('#', '') for k in tags]

In [12]:
tweets_df['tags'] = tweets_df.text.apply(get_tags, args = (tag_regex,))

In [13]:
def clean_tags(text, tags):
    # Eliminate tags from the text
    for tag in tags:
        text = re.sub(' +', ' ', text.replace('#' + tag, '').replace('\n', ' ') \
                      .replace('\r', '').replace(' ― ', '').replace(' …', ''))
    
    return text.strip()     # Clean up

In [14]:
tweets_df['text'] = tweets_df[['text', 'tags']].apply(lambda x: clean_tags(*x), axis=1)
tweets_df['text'][0]        # Verify updates...

'RT @DaviesWriter: At the touch of a lover, everyone becomes a poet.'

* Retweets

In [15]:
retweet_regex = re.compile(r'RT\s@[\w]+:')
def is_retweet(text, regex=retweet_regex):
    retweet = regex.findall(text)
    is_retweet = False
    author = ''
    if retweet:
        text = text.replace(retweet[0], '').strip()
        is_retweet = True
        author = retweet[0].replace('RT @', '').replace(':', '')
    return is_retweet, author, text

In [16]:
is_retweet, authors, text = zip(*tweets_df.text.apply(is_retweet, args = (retweet_regex,)))

In [17]:
tweets_df['is_retweet'], tweets_df['retweet_author'], tweets_df['text'] = [is_retweet, authors, text]

In [18]:
tweets_df.head(3)

Unnamed: 0,screen_name,location,source,favorite_count,favorited,lang,hashtags,created_at,text,tags,is_retweet,retweet_author
0,ccchapman3103,"MN, AZ, TX, USA",Twitter Web Client,0,False,en,"[{'text': 'Plato', 'indices': [70, 76]}, {'tex...",2019-07-02 07:50:54,"At the touch of a lover, everyone becomes a poet.","[Plato, poetry, amwriting, film, screenwriting...",True,DaviesWriter
1,DonRon777,unknown,Twitter Web App,0,False,en,[],2019-07-02 07:48:36,"What do you think about the new series, ""The C...",[],True,777Liquid
2,lavenderlens,sightseeing at the cathedral💀,Twitter for iPhone,0,False,en,"[{'text': 'sfx', 'indices': [89, 93]}, {'text'...",2019-07-02 07:47:27,Time to write a new blog but too many interest...,"[sfx, makeup, artist, film, workshop, immersiv...",True,katesfxmakeup


* Number of words in the tweet

In [19]:
tweets_df['n_words'] = tweets_df.text.apply(len)

In [20]:
tweets_df['has_link'] = tweets_df.text.apply(lambda x: 'http' in x)

In [21]:
emoji_regex = emoji.get_emoji_regexp()
def capture_emojis(text):
    emojis = emoji_regex.findall(text)
    if emojis:
        emoji_count = len(emojis)
        for e in emojis:
            text = text.replace(e, '')
        text = text.strip()
        emojis = ' '.join(emojis)
    else:
        emoji_count = 0
        emojis = ''
    
    return emoji_count, emojis, text

In [22]:
emoji_count, emojis, text = zip(*tweets_df.text.apply(capture_emojis))
tweets_df['emoji_count'], tweets_df['emojis'], tweets_df['text'] = [emoji_count, emojis, text]

In [23]:
tweets_df.head()

Unnamed: 0,screen_name,location,source,favorite_count,favorited,lang,hashtags,created_at,text,tags,is_retweet,retweet_author,n_words,has_link,emoji_count,emojis
0,ccchapman3103,"MN, AZ, TX, USA",Twitter Web Client,0,False,en,"[{'text': 'Plato', 'indices': [70, 76]}, {'tex...",2019-07-02 07:50:54,"At the touch of a lover, everyone becomes a poet.","[Plato, poetry, amwriting, film, screenwriting...",True,DaviesWriter,49,False,0,
1,DonRon777,unknown,Twitter Web App,0,False,en,[],2019-07-02 07:48:36,"What do you think about the new series, ""The C...",[],True,777Liquid,124,False,0,
2,lavenderlens,sightseeing at the cathedral💀,Twitter for iPhone,0,False,en,"[{'text': 'sfx', 'indices': [89, 93]}, {'text'...",2019-07-02 07:47:27,Time to write a new blog but too many interest...,"[sfx, makeup, artist, film, workshop, immersiv...",True,katesfxmakeup,69,False,0,
3,AYoungNegus,Chicago,Twitter for iPhone,0,False,en,"[{'text': 'blackdynamite', 'indices': [97, 111...",2019-07-02 07:45:22,One of the greatest scenes from Black Dynamite...,"[blackdynamite, comedy, classic, blaxploit]",True,jaiganticstudio,74,False,1,😂
4,EmpireDynamic,50 MILLION VIEWS MONTHLY,Twibble.io,0,False,en,"[{'text': 'boxoffice', 'indices': [93, 103]}]",2019-07-02 07:45:11,Cineflix Acquires Global Rights to Israel-Iran...,[boxoffice],False,,116,True,0,
