### Load options, Groupby, Lambdas and Functions

This notebook introduces more advanced ways of manipulating your data, using lambda functions and custom functions, as well as ways to summarize your data by grouping it.

In [1]:
import pandas as pd

**NEW:** 
Every function comes with various options you can specify. Below we look at how to assign a data type to individual columns, and how to have date columns parsed, while loading a CSV file.

In [16]:
%%time
tweets = pd.read_csv(
    '../data/ira_tweets_csv_hashed.csv', 
    dtype = {'tweetid': 'str','retweet_tweetid':'str'}, # <-- here you can specify data as strings, floats or integers,
    parse_dates = ['account_creation_date', 'tweet_time'] # <-- this line makes pandas interpret these columns as dates   
)

CPU times: user 1min 49s, sys: 1min 32s, total: 3min 21s
Wall time: 5min 59s


In [17]:
# Number of rows (tweets) that were loaded.
tweets.shape[0]

9041308

In [21]:
# Show the dtype pandas ended up with for every column after loading.
tweets.dtypes

tweetid                             object
userid                              object
user_display_name                   object
user_screen_name                    object
user_reported_location              object
user_profile_description            object
user_profile_url                    object
follower_count                       int64
following_count                      int64
account_creation_date       datetime64[ns]
account_language                    object
tweet_language                      object
tweet_text                          object
tweet_time                  datetime64[ns]
tweet_client_name                   object
in_reply_to_tweetid                float64
in_reply_to_userid                  object
quoted_tweet_tweetid               float64
is_retweet                            bool
retweet_userid                      object
retweet_tweetid                     object
latitude                           float64
longitude                          float64
quote_count

In [22]:
# Preview the first five rows, transposed so each column of the wide frame becomes a row.
tweets.head(5).transpose()

Unnamed: 0,0,1,2,3,4
tweetid,877919995476496385,492388766930444288,719455077589721089,536179342423105537,841410788409630720
userid,249064136b1c5cb00a705316ab73dd9b53785748ab757f...,0974d5dbee4ca9bd6c3b46d62a5cbdbd5c0d86e196b624...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...
user_display_name,249064136b1c5cb00a705316ab73dd9b53785748ab757f...,0974d5dbee4ca9bd6c3b46d62a5cbdbd5c0d86e196b624...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...
user_screen_name,249064136b1c5cb00a705316ab73dd9b53785748ab757f...,0974d5dbee4ca9bd6c3b46d62a5cbdbd5c0d86e196b624...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...
user_reported_location,"Москва, Россия",Россия,Рязань,Рязань,
user_profile_description,Я примерный семьянин!,Телефонист .Изучение истории Игра в любитель...,волны так и плещут фиолетовой волной,волны так и плещут фиолетовой волной,Отвечаю на любой #ВопросПрезиденту
user_profile_url,,,,,http://t.co/3CVqbMQFbf
follower_count,132,74,165,165,4430
following_count,120,8,454,454,4413
account_creation_date,2013-12-07 00:00:00,2014-03-15 00:00:00,2014-04-29 00:00:00,2014-04-29 00:00:00,2012-02-25 00:00:00


In [23]:
%%time
tweets_2016_2018 = tweets[
    tweets['tweet_time'].dt.year > 2015
]

CPU times: user 3.48 s, sys: 17.9 s, total: 21.3 s
Wall time: 48 s


In [24]:
# Row count of the filtered subset, followed by a preview of its first five rows.
print(tweets_2016_2018.shape[0])
tweets_2016_2018.head(5)

3179248


Unnamed: 0,tweetid,userid,user_display_name,user_screen_name,user_reported_location,user_profile_description,user_profile_url,follower_count,following_count,account_creation_date,...,latitude,longitude,quote_count,reply_count,like_count,retweet_count,hashtags,urls,user_mentions,poll_choices
0,877919995476496385,249064136b1c5cb00a705316ab73dd9b53785748ab757f...,249064136b1c5cb00a705316ab73dd9b53785748ab757f...,249064136b1c5cb00a705316ab73dd9b53785748ab757f...,"Москва, Россия",Я примерный семьянин!,,132,120,2013-12-07,...,,,0.0,0.0,0.0,0.0,[],[http://ru-open.livejournal.com/374284.html],[2572896396],
2,719455077589721089,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,bda40f262856eee77c48a332e5eb23bc4f1943d600867d...,Рязань,волны так и плещут фиолетовой волной,,165,454,2014-04-29,...,,,0.0,0.0,0.0,0.0,[],[https://www.youtube.com/watch?v=9GvpImWxTJc],[40807205],
4,841410788409630720,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...,,Отвечаю на любой #ВопросПрезиденту,http://t.co/3CVqbMQFbf,4430,4413,2012-02-25,...,,,0.0,0.0,3.0,4.0,[],[https://goo.gl/fBp94X],,
5,834365760776630272,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...,a53ed619f1dea6015c7c878bf744b0eefe8f7272dccf34...,,Отвечаю на любой #ВопросПрезиденту,http://t.co/3CVqbMQFbf,4430,4413,2012-02-25,...,,,0.0,0.0,3.0,5.0,[],[https://goo.gl/9w5hso],,
13,701008777001107457,f9dded769bb2275fc6531c1b4d10cf05272c48806fbd73...,f9dded769bb2275fc6531c1b4d10cf05272c48806fbd73...,f9dded769bb2275fc6531c1b4d10cf05272c48806fbd73...,,,,86,279,2015-09-28,...,,,0.0,0.0,0.0,0.0,[],[http://bit.ly/1PMZqPI],,


In [90]:
# Count tweets for every (account language, reported location) pair.
# Rows with a missing value in either key are left out: groupby drops NaN keys by default.
grouped_tweets = (
    tweets_2016_2018
    .groupby(['account_language', 'user_reported_location'])['tweetid']
    .count()
)

In [91]:
# Display the grouped counts: a Series indexed by (account_language, user_reported_location).
grouped_tweets

account_language  user_reported_location        
ar                Az-Zarqa                              1
                  Egypt                                 1
                  United States                       193
                  الدمام                                7
                  سوريا                                12
                  سورية                                 5
de                Berlin, Deutschland               28451
                  Bremen, Deutschland                 228
                  Deutschland                         230
                  Dresden, Sachsen                   7920
                  Düsseldorf, Deutschland             306
                  Erfurt, Deutschland                4053
                  Frankfurt am Main, Deutschland     5099
                  Frankfurt am Main, Hessen           137
                  Germany                            2524
                  Hamburg, Deutschland              14634
                  Hesse

In [94]:
# reset_index() turns the two group-key index levels back into ordinary columns, giving a flat DataFrame.
grouped_tweets.reset_index()

Unnamed: 0,account_language,user_reported_location,tweetid
0,ar,Az-Zarqa,1
1,ar,Egypt,1
2,ar,United States,193
3,ar,الدمام,7
4,ar,سوريا,12
5,ar,سورية,5
6,de,"Berlin, Deutschland",28451
7,de,"Bremen, Deutschland",228
8,de,Deutschland,230
9,de,"Dresden, Sachsen",7920


In [96]:
# Flatten the grouped counts and rank the language/location pairs from most to fewest tweets.
(
    grouped_tweets
    .reset_index()
    .sort_values(by='tweetid', ascending=False)
)

Unnamed: 0,account_language,user_reported_location,tweetid
191,en,USA,476040
252,es,Estados Unidos,285012
195,en,United States,246566
360,ru,"Санкт-Петербург, Россия",65393
326,ru,Москва,62175
351,ru,Россия,55356
61,en,"Chicago, IL",43390
329,ru,"Москва, Россия",40486
36,en,Atlanta,29047
6,de,"Berlin, Deutschland",28451


### Lambdas
Lambdas are small, unnamed functions you can use to manipulate your columns. Think of them as mini-functions written in a single line. You use them in conjunction with the `.apply()` method, which runs the lambda on every value in a column.

In [98]:
# Store the ranked table for further work. Its 'tweetid' column holds the
# per-group tweet count (the result of .count()), not an actual tweet ID.
num_tweets_per_country_language = (
    grouped_tweets
    .reset_index()
    .sort_values(by='tweetid', ascending=False)
)

In [99]:
# Turn each group's tweet count into its share (in percent) of all rows in tweets_2016_2018.
num_tweets_per_country_language['tweetid'].apply(
    lambda count: (count / len(tweets_2016_2018)) * 100
)

191    14.973352
252     8.964761
195     7.755482
360     2.056870
326     1.955651
351     1.741166
61      1.364788
329     1.273446
36      0.913644
6       0.894897
104     0.834443
188     0.805536
177     0.792420
359     0.755430
148     0.745491
381     0.742754
112     0.666195
180     0.655155
37      0.646725
164     0.646222
190     0.625966
179     0.614674
350     0.536731
162     0.535535
41      0.535127
63      0.463978
15      0.460298
327     0.458693
42      0.458599
178     0.449540
         ...    
26      0.000157
266     0.000157
80      0.000126
371     0.000126
261     0.000094
122     0.000094
369     0.000063
114     0.000063
136     0.000063
124     0.000063
43      0.000063
203     0.000031
1       0.000031
389     0.000031
228     0.000031
245     0.000031
168     0.000031
185     0.000031
31      0.000031
271     0.000031
70      0.000031
44      0.000031
47      0.000031
116     0.000031
73      0.000031
52      0.000031
56      0.000031
308     0.0000

In [102]:
# Same percentage calculation as a lambda, this time saved as a new column.
num_tweets_per_country_language['percent_of_all_tweets'] = (
    num_tweets_per_country_language['tweetid']
    .apply(lambda count: (count / len(tweets_2016_2018)) * 100)
)

In [103]:
# Preview the first five rows to check the new percent_of_all_tweets column.
num_tweets_per_country_language.head(5)

Unnamed: 0,account_language,user_reported_location,tweetid,percent_of_all_tweets
191,en,USA,476040,14.973352
252,es,Estados Unidos,285012,8.964761
195,en,United States,246566,7.755482
360,ru,"Санкт-Петербург, Россия",65393,2.05687
326,ru,Москва,62175,1.955651


Here's how you do the same thing with a function:

In [104]:
def calculate_pct(x, total=None):
    """Return ``x`` expressed as a percentage of ``total``.

    Args:
        x: The count to convert (any value supporting ``/`` and ``*``).
        total: The denominator. When omitted, the number of rows in the
            notebook-level ``tweets_2016_2018`` frame is used, so existing
            ``.apply(calculate_pct)`` calls behave exactly as before.

    Returns:
        ``(x / total) * 100``.
    """
    if total is None:
        # Backward-compatible default: share of all tweets in the filtered frame.
        total = len(tweets_2016_2018)
    return (x / total) * 100

In [105]:
# Apply the named function to every count; this should match the lambda-based column.
num_tweets_per_country_language['percent_of_all_tweets2'] = (
    num_tweets_per_country_language['tweetid'].apply(calculate_pct)
)

In [106]:
# Preview the first five rows to compare the lambda-based and function-based columns.
num_tweets_per_country_language.head(5)

Unnamed: 0,account_language,user_reported_location,tweetid,percent_of_all_tweets,percent_of_all_tweets2
191,en,USA,476040,14.973352,14.973352
252,es,Estados Unidos,285012,8.964761,8.964761
195,en,United States,246566,7.755482,7.755482
360,ru,"Санкт-Петербург, Россия",65393,2.05687,2.05687
326,ru,Москва,62175,1.955651,1.955651
