In [1]:
# Import necessary libraries

import pandas as pd
import numpy as np

import warnings
# Ignore all warnings for the rest of the session.
# NOTE(review): this also hides library deprecation/FutureWarning messages — confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
# Load the three raw CSV files; the first column of each file becomes the row index

content = pd.read_csv(filepath_or_buffer='data/Content.csv', index_col=0)
reactions = pd.read_csv(filepath_or_buffer='data/Reactions.csv', index_col=0)
reactiontypes = pd.read_csv(filepath_or_buffer='data/ReactionTypes.csv', index_col=0)

In [3]:
# Preview the first five rows of the content dataset

content.head(n=5)

Unnamed: 0,Content ID,User ID,Type,Category,URL
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [4]:
# Preview the first five rows of the reactions dataset

reactions.head(n=5)

Unnamed: 0,Content ID,User ID,Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,22/04/2021 15:17
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,07/11/2020 9:43
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,17/06/2021 12:22
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,18/04/2021 5:13
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,06/01/2021 19:13


In [5]:
# Preview the first five rows of the reaction types dataset

reactiontypes.head(n=5)

Unnamed: 0,Type,Sentiment,Score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30


In [6]:
# (rows, columns) of each dataset: reactions has the most rows,
# and all three datasets have a reasonable number of columns

[reactiontypes.shape, reactions.shape, content.shape]

[(16, 3), (25553, 4), (1000, 5)]

---

# Tasks
- Remove rows that contain missing values
- Change the data type of values within a column where needed
- Remove columns that are not relevant to the task
    - think about how each column might be relevant to the business question you're investigating

In [7]:
# Listing the column names of each dataset (this shows the labels, not the data types)

[content.columns, reactions.columns, reactiontypes.columns]

[Index(['Content ID', 'User ID', 'Type', 'Category', 'URL'], dtype='object'),
 Index(['Content ID', 'User ID', 'Type', 'Datetime'], dtype='object'),
 Index(['Type', 'Sentiment', 'Score'], dtype='object')]

### Desired Output Columns

- Content ID
- User ID
- Reactions Type
- Reactions Datetime
- Content Type
- Content Category
- ReactionType Sentiment
- ReactionType Score

---

## Cleaning 'Content' Dataset

In [8]:
# Count the missing values in every column of content

content.isna().sum()

Content ID      0
User ID         0
Type            0
Category        0
URL           199
dtype: int64

In [9]:
# 'URL' is not needed for the analysis, so remove that column from content
content = content.drop(columns='URL')
content.head(n=5)

Unnamed: 0,Content ID,User ID,Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food


In [10]:
# Frequency of each raw category label. Issues visible in the counts:
#   - some labels are wrapped in quotation marks
#   - inconsistent capitalization produces duplicate entries for the same category

content.loc[:, 'Category'].value_counts()

technology           71
animals              67
travel               67
science              63
culture              63
healthy eating       61
food                 61
fitness              61
cooking              60
tennis               58
soccer               58
education            57
dogs                 56
studying             55
veganism             48
public speaking      48
Fitness               5
Science               4
Animals               4
Soccer                3
"soccer"              3
"culture"             3
"dogs"                2
Education             2
Studying              2
Food                  2
Travel                2
"studying"            1
Healthy Eating        1
Veganism              1
Culture               1
"science"             1
"animals"             1
"public speaking"     1
"technology"          1
"food"                1
Public Speaking       1
"tennis"              1
"cooking"             1
Technology            1
"veganism"            1
Name: Category, 

In [11]:
# Normalize the category labels so that variants of one category collapse
# into a single value.

# Remove double quotes, single quotes and commas.
# regex=True must be passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would leave the quotation marks in place.
content['Category'] = content['Category'].str.replace(r"[\"\',]", '', regex=True)

# all lowercase, uniform content category
content['Category'] = content['Category'].str.lower()

In [12]:
# Frequency of each category label after cleaning

content.loc[:, 'Category'].value_counts()

technology         73
animals            72
travel             69
science            68
culture            67
fitness            66
soccer             64
food               64
healthy eating     62
cooking            61
education          59
tennis             59
studying           58
dogs               58
veganism           50
public speaking    50
Name: Category, dtype: int64

In [13]:
# Rename 'Type' to 'Content Type' in content

content = content.rename(columns={'Type': 'Content Type'})

In [14]:
# Remove the 'User ID' column from content
content = content.drop(columns='User ID')

In [15]:
# First five rows of content after cleaning
content.head(n=5)

Unnamed: 0,Content ID,Content Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food


----

## Cleaning 'Reactions' Dataset

In [16]:
# First five rows of reactions before cleaning
reactions.head(n=5)

Unnamed: 0,Content ID,User ID,Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,22/04/2021 15:17
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,07/11/2020 9:43
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,17/06/2021 12:22
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,18/04/2021 5:13
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,06/01/2021 19:13


In [17]:
# The goal is to know which content performs well,
# so the 'User ID' column is not necessary for the analysis and is removed

reactions = reactions.drop(columns='User ID')

In [18]:
# Rename 'Type' to 'Reaction Type' in reactions

reactions = reactions.rename(columns={'Type': 'Reaction Type'})

In [19]:
# First five rows of reactions after dropping 'User ID' and renaming 'Type'
reactions.head(n=5)

Unnamed: 0,Content ID,Reaction Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,22/04/2021 15:17
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,07/11/2020 9:43
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,17/06/2021 12:22
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,18/04/2021 5:13
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,06/01/2021 19:13


In [20]:
# Count the missing values in every column of reactions

reactions.isna().sum()

Content ID         0
Reaction Type    980
Datetime           0
dtype: int64

In [21]:
# Drop every row of reactions that contains a missing value
reactions = reactions.dropna()

In [22]:
# (rows, columns) of reactions after the rows with missing values were dropped
reactions.shape

(24573, 3)

---

## Cleaning 'Reaction Types' Dataset

In [23]:
# Rename 'Type' to 'Reaction Type' in reactiontypes

reactiontypes = reactiontypes.rename(columns={'Type': 'Reaction Type'})

In [24]:
# First five rows of reactiontypes after the rename
reactiontypes.head(n=5)

Unnamed: 0,Reaction Type,Sentiment,Score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30


---

## Data Modelling

### Tasks
1. Create a final data set by merging your three tables together
    - Using the Reaction table as the base table, then first join the relevant columns from the Content data set, and then the Reaction Types data set.
2. Figure out the Top 5 performing categories
    - Add up the total scores for each category.

In [25]:
# Cleaned reactions table, used as the left side of the first merge
reactions.head(n=5)

Unnamed: 0,Content ID,Reaction Type,Datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,07/11/2020 9:43
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,17/06/2021 12:22
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,18/04/2021 5:13
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,06/01/2021 19:13
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,23/08/2020 12:25


In [26]:
# Cleaned content table, joined onto reactions in the first merge
content.head(n=5)

Unnamed: 0,Content ID,Content Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,photo,studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,video,food


In [27]:
# Inner-join reactions with content on the shared 'Content ID' key
reactions_content = reactions.merge(content, how='inner', on='Content ID')
reactions_content.head(n=5)

Unnamed: 0,Content ID,Reaction Type,Datetime,Content Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,07/11/2020 9:43,photo,studying
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,17/06/2021 12:22,photo,studying
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,18/04/2021 5:13,photo,studying
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,06/01/2021 19:13,photo,studying
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,interested,23/08/2020 12:25,photo,studying


In [28]:
# Inner-join the reactiontypes columns onto every reaction via 'Reaction Type'
final_dataset = reactions_content.merge(reactiontypes, how='inner', on='Reaction Type')
final_dataset

Unnamed: 0,Content ID,Reaction Type,Datetime,Content Type,Category,Sentiment,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,07/11/2020 9:43,photo,studying,negative,0
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,06/01/2021 19:13,photo,studying,negative,0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,09/04/2021 2:46,photo,studying,negative,0
3,9f737e0a-3cdd-4d29-9d24-753f4e3be810,disgust,28/03/2021 21:15,photo,healthy eating,negative,0
4,230c4e4d-70c3-461d-b42c-ec09396efb3f,disgust,04/08/2020 5:40,photo,healthy eating,negative,0
...,...,...,...,...,...,...,...
24568,435007a5-6261-4d8b-b0a4-55fdc189754b,adore,04/10/2020 22:26,audio,veganism,positive,72
24569,435007a5-6261-4d8b-b0a4-55fdc189754b,adore,18/09/2020 10:50,audio,veganism,positive,72
24570,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,adore,31/10/2020 3:58,GIF,culture,positive,72
24571,4e4c9690-c013-4ee7-9e66-943d8cbd27b7,adore,25/06/2020 15:12,GIF,culture,positive,72


In [29]:
# exporting the final dataset
# NOTE: the export call below is commented out, so running this cell writes no file.
# NOTE(review): the target is an absolute, machine-specific Windows path; it would need adjusting before re-enabling.

# final_dataset.to_csv(r'C:\Users\jeyan\Desktop\Task2FinalDataset.csv', index = False, header = True, encoding = 'utf-8')

---

## Data Analysis

### Task
- The top 5 categories

In [30]:
# Total score per category, sorted from highest to lowest, limited to the top five
(
    final_dataset
    .groupby('Category')['Score']
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .to_frame()
)

Unnamed: 0_level_0,Score
Category,Unnamed: 1_level_1
animals,74965
science,71168
healthy eating,69339
technology,68738
food,66676
