In [1]:
import pandas as pd
import numpy as np

# Load the exported tweets; the file is resolved relative to the notebook's
# working directory.
TWEETS_CSV = 'dataquest_tweets_csv.csv'
df = pd.read_csv(TWEETS_CSV)

# Preview the first five rows to check the columns that were parsed.
df.head(n=5)

Unnamed: 0,date,time,tweet,mentions,photos,replies_count,retweets_count,likes_count
0,2020-06-29,09:36:13,"""Going from 15k to 170k in salary...There's no...",[],['https://pbs.twimg.com/media/EbrvY0SWoAcEQWn....,0,0,0
1,2020-06-26,14:14:44,"Hi Tom, sorry about that! We try to respond to...",['beingtomiwa'],[],0,0,0
2,2020-06-26,12:00:14,Become an @rstudio power user: https://bit.ly/...,['rstudio'],[],0,2,7
3,2020-06-26,11:23:21,Please get in touch with our support team by e...,['jimohkassim'],[],0,0,0
4,2020-06-26,02:00:05,Learn to master R markdown in this free tutori...,[],[],0,2,7


## Adding a Pandas Column with a True/False Condition Using np.where()

To add a column whose value depends on a single True/False condition, we’ll use NumPy’s built-in where() function. This function takes three arguments in sequence: the condition we’re testing for, the value to assign to our new column if that condition is true, and the value to assign if it is false. It looks like this:

np.where(condition, value if condition is true, value if condition is false)

In [3]:
# A tweet counts as having an image when its 'photos' entry is anything other
# than the literal text '[]' (the comparison is against a string, not a list).
has_photo = df['photos'].ne('[]')

# np.where(condition, value_if_true, value_if_false) turns the mask into the
# new column's values.
df['hasimage'] = np.where(has_photo, True, False)
df.head()

Unnamed: 0,date,time,tweet,mentions,photos,replies_count,retweets_count,likes_count,hasimage
0,2020-06-29,09:36:13,"""Going from 15k to 170k in salary...There's no...",[],['https://pbs.twimg.com/media/EbrvY0SWoAcEQWn....,0,0,0,True
1,2020-06-26,14:14:44,"Hi Tom, sorry about that! We try to respond to...",['beingtomiwa'],[],0,0,0,False
2,2020-06-26,12:00:14,Become an @rstudio power user: https://bit.ly/...,['rstudio'],[],0,2,7,False
3,2020-06-26,11:23:21,Please get in touch with our support team by e...,['jimohkassim'],[],0,0,0,False
4,2020-06-26,02:00:05,Learn to master R markdown in this free tutori...,[],[],0,2,7,False


In [4]:
# Split the tweets by the boolean 'hasimage' flag. A boolean column is already
# a valid row mask, so it is used directly (and negated with ~ for the
# complement) instead of being compared against True / False.
image_tweets = df[df['hasimage']]
no_image_tweets = df[~df['hasimage']]

In [5]:
# Compare mean engagement for all tweets, tweets with an image, and tweets
# without one. Each (label, frame) pair yields one printed line per metric.
groups = [
    ('all tweets', df),
    ('image tweets', image_tweets),
    ('no image tweets', no_image_tweets),
]

# (label used in the printed text, column the mean is taken over)
metrics = [
    ('likes', 'likes_count'),
    ('RTs', 'retweets_count'),
]

for position, (metric_label, column) in enumerate(metrics):
    if position > 0:
        # Separator between metric sections; print() appends its own newline,
        # so this emits two line breaks.
        print('\n')
    for group_label, tweets in groups:
        print('Average ' + metric_label + ', ' + group_label + ': ' + str(tweets[column].mean()))

Average likes, all tweets: 6.209759328770148
Average likes, image tweets: 14.21042471042471
Average likes, no image tweets: 5.176514584891549


Average RTs, all tweets: 1.5553102230072864
Average RTs, image tweets: 3.5386100386100385
Average RTs, no image tweets: 1.2991772625280478


## Adding a Pandas Column with More Complicated Conditions

In [6]:
# create a list of our conditions (one boolean Series per tier, bucketed by likes)
conditions = [
    (df['likes_count'] <= 2),
    (df['likes_count'] > 2) & (df['likes_count'] <= 9),
    (df['likes_count'] > 9) & (df['likes_count'] <= 15),
    (df['likes_count'] > 15)
    ]

# create a list of the values we want to assign for each condition
# (same order as `conditions`: fewest likes -> tier_4, most likes -> tier_1)
values = ['tier_4', 'tier_3', 'tier_2', 'tier_1']

# create a new column and use np.select to assign values to it using our lists as arguments.
# np.select falls back to `default` for rows where no condition is True (for
# example a missing likes_count). Its implicit default is the integer 0, which
# has no common dtype with the string labels -- newer NumPy releases reject that
# mix with a TypeError -- so an explicit string fallback is supplied.
df['tier'] = np.select(conditions, values, default='unknown')

# display updated DataFrame
df.head()

Unnamed: 0,date,time,tweet,mentions,photos,replies_count,retweets_count,likes_count,hasimage,tier
0,2020-06-29,09:36:13,"""Going from 15k to 170k in salary...There's no...",[],['https://pbs.twimg.com/media/EbrvY0SWoAcEQWn....,0,0,0,True,tier_4
1,2020-06-26,14:14:44,"Hi Tom, sorry about that! We try to respond to...",['beingtomiwa'],[],0,0,0,False,tier_4
2,2020-06-26,12:00:14,Become an @rstudio power user: https://bit.ly/...,['rstudio'],[],0,2,7,False,tier_3
3,2020-06-26,11:23:21,Please get in touch with our support team by e...,['jimohkassim'],[],0,0,0,False,tier_4
4,2020-06-26,02:00:05,Learn to master R markdown in this free tutori...,[],[],0,2,7,False,tier_3


In [7]:
# Tier 4 tweets: share of tweets with vs. without an image.
# normalize=True returns proportions instead of raw counts.
is_tier_4 = df['tier'] == 'tier_4'
df.loc[is_tier_4, 'hasimage'].value_counts(normalize=True)

False    0.948784
True     0.051216
Name: hasimage, dtype: float64

In [8]:
# Tier 1 tweets: share of tweets with vs. without an image.
# normalize=True returns proportions instead of raw counts.
is_tier_1 = df['tier'] == 'tier_1'
df.loc[is_tier_1, 'hasimage'].value_counts(normalize=True)

False    0.836842
True     0.163158
Name: hasimage, dtype: float64