# Import pandas and NumPy libraries.

In [41]:
import pandas as pd
import numpy as np

# Read the tweet_engagements.csv file as pandas DataFrame.

In [42]:
tweets = pd.read_csv("tweet_engagements.csv")

# Explore the dataset using pandas functions.

In [43]:
tweets.head()

Unnamed: 0,Date,Impression,Engagement
0,1.11.2020,506,106
1,2.11.2020,331,50
2,3.11.2020,377,86
3,4.11.2020,333,108
4,5.11.2020,558,31


In [44]:
tweets.tail()

Unnamed: 0,Date,Impression,Engagement
117,26.04.2021,319,26
118,27.04.2021,397,119
119,28.04.2021,531,124
120,29.04.2021,150,28
121,30.04.2021,253,40


In [45]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Date        122 non-null    object
 1   Impression  122 non-null    int64 
 2   Engagement  122 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 3.0+ KB


In [46]:
tweets.describe()

Unnamed: 0,Impression,Engagement
count,122.0,122.0
mean,358.04918,53.860656
std,147.129552,37.894659
min,111.0,7.0
25%,228.0,25.25
50%,360.0,44.0
75%,478.75,72.0
max,619.0,187.0


In [8]:
tweets[tweets.duplicated() == True]

Unnamed: 0,Date,Impression,Engagement


In [9]:
for cn,c in tweets.iteritems():
    display(c.value_counts().nlargest(n = 5))

1.11.2020     1
31.03.2021    1
29.03.2021    1
28.03.2021    1
27.03.2021    1
Name: Date, dtype: int64

360    3
112    2
196    2
582    2
124    2
Name: Impression, dtype: int64

36    4
45    4
50    3
21    3
86    3
Name: Engagement, dtype: int64

In [10]:
for cn,c in tweets.iteritems():
    display(c.value_counts().nsmallest(n = 6))

1.11.2020     1
31.03.2021    1
29.03.2021    1
28.03.2021    1
27.03.2021    1
26.03.2021    1
Name: Date, dtype: int64

416    1
571    1
280    1
147    1
252    1
158    1
Name: Impression, dtype: int64

75     1
117    1
180    1
103    1
51     1
102    1
Name: Engagement, dtype: int64

In [11]:
tweets.nunique()

Date          122
Impression    111
Engagement     76
dtype: int64

In [12]:
for cn,c in tweets.iteritems():
    display(c.value_counts(normalize = True) * 100)

1.11.2020     0.819672
31.03.2021    0.819672
29.03.2021    0.819672
28.03.2021    0.819672
27.03.2021    0.819672
                ...   
7.12.2020     0.819672
6.12.2020     0.819672
5.12.2020     0.819672
4.12.2020     0.819672
30.04.2021    0.819672
Name: Date, Length: 122, dtype: float64

360    2.459016
112    1.639344
196    1.639344
582    1.639344
124    1.639344
         ...   
552    0.819672
227    0.819672
297    0.819672
465    0.819672
253    0.819672
Name: Impression, Length: 111, dtype: float64

36     3.278689
45     3.278689
50     2.459016
21     2.459016
86     2.459016
         ...   
112    0.819672
157    0.819672
42     0.819672
27     0.819672
124    0.819672
Name: Engagement, Length: 76, dtype: float64

# Transforming the Dataset

## Generate a DataFrame that contains week numbers in rows, week day names in columns, and engagement rates as values.

In [47]:
# Transform the Date column into a pandas date-time format
tweets["Date"] = pd.to_datetime(tweets["Date"])

In [48]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        122 non-null    datetime64[ns]
 1   Impression  122 non-null    int64         
 2   Engagement  122 non-null    int64         
dtypes: datetime64[ns](1), int64(2)
memory usage: 3.0 KB


In [49]:
tweets.head()

Unnamed: 0,Date,Impression,Engagement
0,2020-01-11,506,106
1,2020-02-11,331,50
2,2020-03-11,377,86
3,2020-04-11,333,108
4,2020-05-11,558,31


In [None]:
tweets.apply(func= , axis=1)

In [18]:
# column that contains the names of the days of the week from Date.
tweets['week_day'] = tweets["Date"].dt.day_name()

In [15]:
# Create a week_number column that contains the week number of the year from Date. Consider Monday as the starting day of the week.
tweets['week_number'] = tweets['Date'].dt.isocalendar().week 

In [20]:
# Create a rate column that contains an engagement-per-impression rate.
tweets['rate'] = tweets['Engagement'] / tweets['Impression']

In [21]:
tweets.head()

Unnamed: 0,Date,Impression,Engagement,week_day,week_number,rate
0,2020-01-11,506,106,Saturday,2,0.209486
1,2020-02-11,331,50,Tuesday,7,0.151057
2,2020-03-11,377,86,Wednesday,11,0.228117
3,2020-04-11,333,108,Saturday,15,0.324324
4,2020-05-11,558,31,Monday,20,0.055556


In [36]:
# Generate a DataFrame that contains week numbers in rows, week day names in columns, and engagement rates as values.
engagement_df = tweets.pivot_table(index = "week_number", columns= "week_day", values= "rate")

In [37]:
engagement_df.head()

week_day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
week_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,,0.095628,,,,,
2,,,0.209486,0.059102,,,
5,,,,,0.200573,,0.07124
7,,,,,,0.151057,0.180905
9,,,,,0.181159,,0.151786


In [38]:
len(engagement_df)

35

In [39]:
# Drop the rows of the DataFrame that contain any NA value.
engagement_df.dropna(axis=0)

week_day,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
week_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
11,0.060475,0.143275,0.041667,0.195777,0.083788,0.335196,0.203971
12,0.120275,0.04,0.042857,0.126095,0.184211,0.325,0.27381
16,0.17053,0.225,0.150794,0.049145,0.095506,0.179211,0.117808
47,0.190291,0.215496,0.178808,0.11215,0.134146,0.060071,0.077358
48,0.231526,0.135321,0.05659,0.048828,0.071291,0.347134,0.103211
51,0.072034,0.09009,0.180505,0.035514,0.103896,0.288421,0.140127
52,0.279359,0.21608,0.030516,0.134328,0.207516,0.312187,0.273171
