# First tweet

### Load the data and count.

In [1]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import tweet_iter, tweet_type

# Configure the root logger to emit DEBUG-and-above records; the
# "Loaded N" progress lines in this cell's output are logged at DEBUG.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Gzipped JSON tweet files to load, in load order.
filepaths = [
    'd59d27e2f2ed4778881573df2ecf2fad_001.json.gz',
    '25319652321b4bb498b250ffc53aa0f0_001.json.gz',
]

def tweet_transform(tweet):
    """Reduce one raw tweet dict to the flat record used in this analysis.

    Args:
        tweet: a tweet as a nested dict; must provide 'id_str',
            'created_at' and a 'user' dict with 'id_str', 'screen_name',
            'created_at' and 'statuses_count'.

    Returns:
        dict with the keys tweet_id, tweet_created_at, user_id,
        screen_name, user_created_at, tweets_to_date and tweet_type.
        Both *_created_at values are parsed with dateutil's parser;
        tweet_type is whatever utils.tweet_type returns for the tweet.

    Raises:
        KeyError: if any of the fields listed above is missing.
    """
    record = {}
    record['tweet_id'] = tweet['id_str']
    record['tweet_created_at'] = date_parse(tweet['created_at'])

    # Everything below except tweet_type comes from the embedded user object.
    author = tweet['user']
    record['user_id'] = author['id_str']
    record['screen_name'] = author['screen_name']
    record['user_created_at'] = date_parse(author['created_at'])
    # statuses_count is the account's tweet total as reported on this tweet.
    record['tweets_to_date'] = author['statuses_count']

    record['tweet_type'] = tweet_type(tweet)
    return record

# Column order for the loaded frame (the same keys tweet_transform emits).
tweet_columns = [
    'tweet_id', 'user_id', 'screen_name', 'tweet_created_at',
    'user_created_at', 'tweets_to_date', 'tweet_type',
]

# tweet_iter (from utils) receives the file paths and the per-tweet transform;
# presumably it yields one transformed record per tweet -- confirm in utils.
tweet_records = tweet_iter(filepaths, tweet_transform_func=tweet_transform)
tweet_df = pd.DataFrame(tweet_records, columns=tweet_columns)

# Non-null count per column, displayed as the cell result.
tweet_df.count()

DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
DEBUG:root:Loaded 450000
DEBUG:root:Loaded 500000
DEBUG:root:Loaded 550000
DEBUG:root:Loaded 600000
DEBUG:root:Loaded 650000
DEBUG:root:Loaded 700000
DEBUG:root:Loaded 750000
DEBUG:root:Loaded 800000
DEBUG:root:Loaded 850000
DEBUG:root:Loaded 900000
DEBUG:root:Loaded 950000
DEBUG:root:Loaded 1000000
DEBUG:root:Loaded 1050000
DEBUG:root:Loaded 1100000
DEBUG:root:Loaded 1150000
DEBUG:root:Loaded 1200000
DEBUG:root:Loaded 1250000
DEBUG:root:Loaded 1300000
DEBUG:root:Loaded 1350000
DEBUG:root:Loaded 1400000
DEBUG:root:Loaded 1450000
DEBUG:root:Loaded 1500000
DEBUG:root:Loaded 0
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
DEBUG:root:Loaded 300000
DEBUG:root:Loaded 350000
DEBUG:root:Loaded 400000
D

tweet_id            3364440
user_id             3364440
screen_name         3364440
tweet_created_at    3364440
user_created_at     3364440
tweets_to_date      3364440
tweet_type          3364440
dtype: int64

### View the top of the data.

In [2]:
# Peek at the first five rows to sanity-check the parsed columns.
tweet_df.head(5)

Unnamed: 0,tweet_id,user_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
0,847821180832804864,1638925448,A_Childers_,2017-03-31 14:41:35+00:00,2013-08-01 21:44:28+00:00,6071,retweet
1,847814632643473411,1638925448,A_Childers_,2017-03-31 14:15:34+00:00,2013-08-01 21:44:28+00:00,6071,retweet
2,847627543142219776,1638925448,A_Childers_,2017-03-31 01:52:09+00:00,2013-08-01 21:44:28+00:00,6071,reply
3,847597404719267841,1638925448,A_Childers_,2017-03-30 23:52:23+00:00,2013-08-01 21:44:28+00:00,6071,reply
4,847593734896324608,1638925448,A_Childers_,2017-03-30 23:37:48+00:00,2013-08-01 21:44:28+00:00,6071,reply


### Remove duplicates
Duplicate tweets can occur when collecting data from the Twitter API, so keep only one row per tweet_id.

In [3]:
# Number of distinct tweet ids; dropna=False counts a missing id as its own
# value, exactly as len(Series.unique()) would.
tweet_df['tweet_id'].nunique(dropna=False)

3335489

In [4]:
# Keep one row per tweet_id; keep='last' retains the final occurrence in load order.
dedupe_tweet_df = tweet_df.drop_duplicates(subset='tweet_id', keep='last')

# Row count after de-duplication (should equal the unique-id count above).
dedupe_tweet_df.shape[0]

3335489

### Number of tweets in dataset for each user

In [5]:
# Count the de-duplicated tweets per user (value_counts sorts descending).
#
# The Series is named explicitly *before* it becomes a frame instead of
# renaming a 'user_id' column afterwards: from pandas 2.0 on, value_counts()
# names its result 'count' (the original column name moves to the index), so
# rename(columns={'user_id': ...}) would silently match nothing and the
# column would not be called 'tweets_in_dataset'. Naming the Series gives the
# same frame on both old and new pandas.
tweet_count_df = (
    dedupe_tweet_df['user_id']
    .value_counts()
    .rename('tweets_in_dataset')
    .to_frame()
)
# Older pandas leaves the value_counts index unnamed; set it so the later
# join on user_id reads clearly.
tweet_count_df.index.name = 'user_id'

# Number of users with at least one tweet in the dataset.
tweet_count_df.count()

tweets_in_dataset    1443
dtype: int64

In [6]:
# The five users with the most tweets in the dataset.
tweet_count_df.head(5)

Unnamed: 0_level_0,tweets_in_dataset
user_id,Unnamed: 1_level_1
3817401,5286
22891564,4321
456994513,4273
593813785,4110
15146659,3945


In [7]:
# Get the first tweet for each user:
#   1. per user_id, find the index label of the row with the earliest
#      tweet_created_at;
#   2. select those rows and key the result by user_id.
earliest_tweet_idx = dedupe_tweet_df.groupby('user_id')['tweet_created_at'].idxmin()
first_tweet_df = dedupe_tweet_df.loc[earliest_tweet_idx].set_index('user_id')

# One row per user is expected, matching the user count in tweet_count_df.
first_tweet_df.count()

tweet_id            1443
screen_name         1443
tweet_created_at    1443
user_created_at     1443
tweets_to_date      1443
tweet_type          1443
dtype: int64

In [8]:
# Spot-check the earliest-tweet rows for the first five users.
first_tweet_df.head(5)

Unnamed: 0_level_0,tweet_id,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweet_type
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
100165378,619906732052074496,ChristineSisto,2015-07-11 16:30:56+00:00,2009-12-29 07:27:27+00:00,8646,retweet
1001991865,289090058148012033,FredSchulte,2013-01-09 19:23:35+00:00,2012-12-10 16:16:10+00:00,888,reply
1002229862,425802092465623040,HMRothmandc,2014-01-22 01:28:24+00:00,2012-12-10 18:37:13+00:00,1777,reply
100270054,740945974143635464,Laubarth,2016-06-09 16:37:41+00:00,2009-12-29 17:02:01+00:00,6,original
100802089,7240989598,ayesharascoe,2009-12-31 17:27:25+00:00,2009-12-31 16:48:11+00:00,491,original


In [9]:
# Attach each user's tweets-in-dataset count (index-on-index join on user_id),
# then drop the per-tweet columns that are not needed in the user summary.
first_tweet_with_counts = first_tweet_df.join(tweet_count_df)
first_tweet_merge_df = first_tweet_with_counts.drop(columns=['tweet_id', 'tweet_type'])

# Non-null count per column, displayed as the cell result.
first_tweet_merge_df.count()

screen_name          1443
tweet_created_at     1443
user_created_at      1443
tweets_to_date       1443
tweets_in_dataset    1443
dtype: int64

## First tweet for each user  <----------
For each user, the table below shows the date of the first tweet in the dataset, the date the account was created, the (approximate) number of tweets to date, and the number of tweets in the dataset.

* If user_created_at and tweet_created_at are close, then this is probably a new account.
* If user_created_at and tweet_created_at are not close but there is a small number of tweets, then this user probably started tweeting recently (much like a new account).
* If user_created_at and tweet_created_at are not close and there is a large number of tweets, then this is probably a prolific tweeter. Note that probably not all of this user's tweets were collected.

In [10]:
# Users whose first tweet in the dataset is most recent come first; show the top 20.
latest_first_tweets = first_tweet_merge_df.sort_values(by='tweet_created_at', ascending=False)
latest_first_tweets.head(20)

Unnamed: 0_level_0,screen_name,tweet_created_at,user_created_at,tweets_to_date,tweets_in_dataset
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
76696176,sklee_ca,2017-03-31 17:07:58+00:00,2009-09-23 17:09:53+00:00,2,2
66768858,emmaroller,2017-03-27 13:25:07+00:00,2009-08-18 19:10:55+00:00,223,210
842787331224584192,RebeccaEHoffman,2017-03-17 17:27:58+00:00,2017-03-17 17:18:52+00:00,9,9
831972200014045191,ErinMcManus15,2017-03-15 14:16:31+00:00,2017-02-15 21:03:24+00:00,1,1
20281013,EvanMcS,2017-03-10 16:33:43+00:00,2009-02-06 23:09:59+00:00,146,196
18825339,CahnEmily,2017-03-10 13:43:41+00:00,2009-01-10 03:19:50+00:00,86548,3205
30176025,LaurenFCarroll,2017-03-08 20:33:21+00:00,2009-04-10 06:29:32+00:00,34,49
3817401,ericgeller,2017-03-07 16:38:59+00:00,2007-04-08 20:27:11+00:00,186181,5286
21612122,HotlineJosh,2017-03-03 22:00:52+00:00,2009-02-22 23:45:46+00:00,143393,3227
22891564,chrisgeidner,2017-03-02 16:35:58+00:00,2009-03-05 06:48:00+00:00,193071,4321


## Types of tweets <----------
While all the tweets are loaded, summarize the types of tweets.

In [11]:
# Tally the de-duplicated tweets by type, most common first.
tweet_type_counts = dedupe_tweet_df['tweet_type'].value_counts()
tweet_type_counts

original    1593541
retweet     1094028
reply        396287
quote        251633
Name: tweet_type, dtype: int64