In [1]:
import pandas as pd 
import matplotlib.pyplot as plt


In [None]:
# Load Content.csv, keep only the columns at positions 1, 3 and 4, and drop
# every row that has a missing value in any of the kept columns.
content_df = pd.read_csv('Content.csv').iloc[:, [1, 3, 4]].dropna()

# 'Type' is renamed to 'Content_Type'.
content_df = content_df.rename(columns={'Type': 'Content_Type'})

# Normalise Category: strip leading/trailing double quotes and lower-case the
# text so that differently quoted/cased spellings collapse into one value.
content_df = content_df.assign(
    Category=content_df['Category'].str.strip('"').str.lower()
)

content_df

In [None]:
# Load the raw reactions export.
reactions_df = pd.read_csv('Reactions.csv')

# Keep only the columns at positions 1, 3 and 4.
reactions_df = reactions_df.iloc[:, [1, 3, 4]]

# Drop every row that has a missing value in any of the kept columns.
reactions_df = reactions_df.dropna()

# Rename 'Type' to 'Reaction_Type'.
# The previous version passed the *string* 'True' as `inplace`; that is not a
# boolean and only behaves like one through truthiness (or is rejected outright
# where pandas validates the argument). Reassigning the returned frame avoids
# `inplace` altogether.
reactions_df = reactions_df.rename(columns={'Type': 'Reaction_Type'})

reactions_df

In [None]:
# Load ReactionTypes.csv, discard its first column (every column from
# position 1 onward is kept) and rename 'Type' to 'Reaction_Type'.
reactionTypes_df = (
    pd.read_csv('ReactionTypes.csv')
    .iloc[:, 1:]
    .rename(columns={'Type': 'Reaction_Type'})
)

reactionTypes_df

In [None]:
# Join reactions to content, then join the result to the reaction-type lookup.
# No `on=` is given, so each merge uses whichever column names the two frames
# share (presumably the content id, then 'Reaction_Type' -- verify against the
# CSV headers).
merged_df = reactions_df.merge(content_df).merge(reactionTypes_df)

# Total Score per Category, highest first, limited to the five best categories.
merged_df.groupby('Category')['Score'].sum().sort_values(ascending=False).head(5)


# My Solution
# Category
# animals           74965
# science           71168
# healthy eating    69339
# technology        68738
# food              66676

In [None]:
# What are the aggregate scores of all Content Categories?

# df: Series indexed by Category holding the summed Score of each category.
df = merged_df.groupby('Category')['Score'].sum()

# top5_df: the five categories with the largest summed Score.
top5_df = df.sort_values(ascending=False).head(5)

# One colour per bar, in the same order as df: red for a top-5 category,
# blue for every other category.
bar_colors = ['red' if in_top5 else 'blue' for in_top5 in df.index.isin(top5_df.index)]

plt.bar(x=df.index, height=df, color=bar_colors)
plt.xticks(rotation=90)
plt.xlabel(df.index.name)
plt.ylabel('Aggregate Scores')
plt.title('Content_Category vs Scores')
plt.show()

In [40]:
# How does content popularity trend over time?
# Parse the Datetime column into pandas datetimes so the `.dt` accessor can be used on it.
merged_df['Datetime'] = merged_df['Datetime'].pipe(pd.to_datetime)


In [None]:
# Month with the highest Scores
# Sum Score per calendar month and draw it as a line plot. Requires the
# Datetime column to already hold datetimes (the `.dt` accessor is used).
# 'M' is the documented monthly period alias; the lowercase spelling used
# previously is deprecated in recent pandas releases.
merged_df.groupby(merged_df['Datetime'].dt.to_period('M'))['Score'].sum().plot()
plt.xlabel('Month')
plt.ylabel('Aggregate Scores')
plt.title('Scores of each month')
# Render explicitly, like the other plotting cells, so the cell output is the
# figure rather than the repr of the title's Text object.
plt.show()

# Finding recorded by the original author: May

In [None]:
# Which Content type is mostly used by the users?

# df1: Series indexed by Content_Type holding the summed Score of each type.
# NOTE(review): this measures total Score per type, not how many times each
# type occurs -- confirm that is the intended notion of "mostly used".
df1 = merged_df['Score'].groupby(merged_df['Content_Type']).sum()

plt.bar(x=df1.index, height=df1)
plt.xlabel(df1.index.name)
plt.ylabel('Aggregate Scores')
plt.title('Content_Type vs Scores')
plt.show()


In [None]:
# Pie chart: share of the summed Score contributed by each Content_Type,
# restricted to rows whose Category is "animals".
fig, ax = plt.subplots()
d = merged_df[merged_df['Category'] == 'animals'].groupby('Content_Type')['Score'].sum()
ax.pie(d, labels=d.index, autopct='%1.1f%%')
ax.set_title('Content_Type % of "animals" Category')