# ACCENTURE DATA ANALYTICS VIRTUAL INTERNSHIP

## Data Cleaning

In [1]:
# Import the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Content Dataset

In [43]:
# Load the raw Content table; the path is relative to the notebook's
# working directory.
df_content = pd.read_csv(filepath_or_buffer='Downloads/Content.csv')

# Preview the first five rows to confirm the file parsed as expected
df_content.head(n=5)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Content Type,Category,URL
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [44]:
# Report the table's dimensions as a (rows, columns) tuple
df_content.shape

(1000, 6)

In [45]:
# Count the missing entries in each column (isna is the same check as isnull)
df_content.isna().sum()

Unnamed: 0        0
Content ID        0
User ID           0
Content Type      0
Category          0
URL             199
dtype: int64

In [46]:
# Inspect the dtype pandas inferred for each column when reading the CSV
df_content.dtypes

Unnamed: 0       int64
Content ID      object
User ID         object
Content Type    object
Category        object
URL             object
dtype: object

In [47]:
# Drop the 'Unnamed: 0' column as it is not relevant to our analysis
# (in the preview it only holds a running row counter: 0, 1, 2, ...).
# drop(..., errors='ignore') is used instead of `del` so the cell can be
# re-run safely: a second execution is a no-op rather than a KeyError.
df_content = df_content.drop(columns=['Unnamed: 0'], errors='ignore')
df_content.head()

Unnamed: 0,Content ID,User ID,Content Type,Category,URL
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying,https://socialbuzz.cdn.com/content/storage/975...
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/9f7...
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating,https://socialbuzz.cdn.com/content/storage/230...
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology,https://socialbuzz.cdn.com/content/storage/356...
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food,https://socialbuzz.cdn.com/content/storage/01a...


In [48]:
# Drop the 'URL' column as it is not relevant to our analysis.
# drop(..., errors='ignore') is used instead of `del` so the cell can be
# re-run safely: a second execution is a no-op rather than a KeyError.
df_content = df_content.drop(columns=['URL'], errors='ignore')
df_content.head()

Unnamed: 0,Content ID,User ID,Content Type,Category
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,8d3cd87d-8a31-4935-9a4f-b319bfe05f31,photo,Studying
1,9f737e0a-3cdd-4d29-9d24-753f4e3be810,beb1f34e-7870-46d6-9fc7-2e12eb83ce43,photo,healthy eating
2,230c4e4d-70c3-461d-b42c-ec09396efb3f,a5c65404-5894-4b87-82f2-d787cbee86b4,photo,healthy eating
3,356fff80-da4d-4785-9f43-bc1261031dc6,9fb4ce88-fac1-406c-8544-1a899cee7aaf,photo,technology
4,01ab84dd-6364-4236-abbb-3f237db77180,e206e31b-5f85-4964-b6ea-d7ee5324def1,video,food


In [50]:
# List the distinct values in the 'Content Type' column to spot
# inconsistent spellings or unexpected labels
df_content['Content Type'].unique()

# This column needs no clean-up: the recorded output shows four distinct,
# consistently spelled labels with no duplicates differing only by case.

array(['photo', 'video', 'GIF', 'audio'], dtype=object)

In [51]:
# Normalise the Category labels to lower case and write the result back to the
# frame. Lower-casing only for display would leave the mixed-case spellings
# (e.g. 'Studying' in the preview) in df_content and in the CSV saved later.
df_content['Category'] = df_content['Category'].str.lower()

# List the distinct categories that remain after normalisation
df_content['Category'].unique()

array(['studying', 'healthy eating', 'technology', 'food', 'cooking',
       'dogs', 'soccer', 'public speaking', 'science', 'tennis', 'travel',
       'fitness', 'education', 'veganism', 'animals', 'culture'],
      dtype=object)

In [52]:
# Persist the cleaned Content table to the working directory;
# index=False keeps the DataFrame's row index out of the file.
df_content.to_csv(path_or_buf='cleaned_content.csv', index=False)

### Reactions Dataset

In [54]:
# Load the raw Reactions table; the path is relative to the notebook's
# working directory.
df_reactions = pd.read_csv(filepath_or_buffer='Downloads/Reactions.csv')

# Preview the first five rows to confirm the file parsed as expected
df_reactions.head(n=5)

Unnamed: 0.1,Unnamed: 0,Content ID,User ID,Reactions Type,Datetime
0,0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,22/04/2021 15:17
1,1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,07/11/2020 09:43
2,2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,17/06/2021 12:22
3,3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,18/04/2021 05:13
4,4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,06/01/2021 19:13


In [55]:
# Inspect the dtype pandas inferred for each column when reading the CSV
# (in the recorded output 'Datetime' is still a plain object/string column here)
df_reactions.dtypes

Unnamed: 0         int64
Content ID        object
User ID           object
Reactions Type    object
Datetime          object
dtype: object

In [56]:
# Count the missing entries in each column (isna is the same check as isnull)
df_reactions.isna().sum()

Unnamed: 0           0
Content ID           0
User ID           3019
Reactions Type     980
Datetime             0
dtype: int64

In [57]:
# Report the table's dimensions as a (rows, columns) tuple
df_reactions.shape

(25553, 5)

In [58]:
# Drop the 'Unnamed: 0' column as it is not relevant to our analysis
# (in the preview it only holds a running row counter: 0, 1, 2, ...).
# drop(..., errors='ignore') is used instead of `del` so the cell can be
# re-run safely: a second execution is a no-op rather than a KeyError.
df_reactions = df_reactions.drop(columns=['Unnamed: 0'], errors='ignore')
df_reactions.head()

Unnamed: 0,Content ID,User ID,Reactions Type,Datetime
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,,22/04/2021 15:17
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,07/11/2020 09:43
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,17/06/2021 12:22
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,18/04/2021 05:13
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,06/01/2021 19:13


In [62]:
# Remove every row that has at least one missing value. The drop is done
# in place so later cells keep working on the same df_reactions object.
# NOTE(review): the recorded null counts show many more missing User IDs than
# missing Reactions Types, so this also discards rows whose only gap is the
# User ID - confirm that is intended.
df_reactions.dropna(inplace=True)

# Verify the clean-up. A bare expression in the middle of a cell is evaluated
# but never displayed, so the check is written as an assertion that fails
# loudly (and prints the per-column counts) if any null value survives.
remaining_nulls = df_reactions.isnull().sum()
assert not remaining_nulls.any(), f"Null values remain after dropna():\n{remaining_nulls}"

# Preview the cleaned rows (the original row labels are kept, so they no
# longer start at 0)
df_reactions.head()

Unnamed: 0,Content ID,User ID,Reactions Type,Datetime
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,5d454588-283d-459d-915d-c48a2cb4c27f,disgust,07/11/2020 09:43
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,92b87fa5-f271-43e0-af66-84fac21052e6,dislike,17/06/2021 12:22
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,163daa38-8b77-48c9-9af6-37a6c1447ac2,scared,18/04/2021 05:13
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,34e8add9-0206-47fd-a501-037b994650a2,disgust,06/01/2021 19:13
5,97522e57-d9ab-4bd6-97bf-c24d952602d2,9b6d35f9-5e15-4cd0-a8d7-b1f3340e02c4,interested,23/08/2020 12:25


In [65]:
# Parse the 'Datetime' strings (day/month/year hour:minute) into real
# datetime values, then store them back in the same column.
parsed_datetimes = pd.to_datetime(df_reactions['Datetime'], format='%d/%m/%Y %H:%M')
df_reactions['Datetime'] = parsed_datetimes

# Confirm the column now carries a datetime dtype
df_reactions.dtypes

Content ID                object
User ID                   object
Reactions Type            object
Datetime          datetime64[ns]
dtype: object

In [66]:
# Persist the cleaned Reactions table to the working directory;
# index=False keeps the DataFrame's row index out of the file.
df_reactions.to_csv(path_or_buf='cleaned_reactions.csv', index=False)

### ReactionTypes Dataset

In [67]:
# Load the raw ReactionTypes lookup table; the path is relative to the
# notebook's working directory.
df_reactiontypes = pd.read_csv(filepath_or_buffer='Downloads/ReactionTypes.csv')

# Preview the first five rows to confirm the file parsed as expected
df_reactiontypes.head(n=5)

Unnamed: 0.1,Unnamed: 0,Reactions Type,Sentiment,Score
0,0,heart,positive,60
1,1,want,positive,70
2,2,disgust,negative,0
3,3,hate,negative,5
4,4,interested,positive,30


In [68]:
# Report the table's dimensions as a (rows, columns) tuple
df_reactiontypes.shape

(16, 4)

In [69]:
# Count the missing entries in each column (isna is the same check as isnull)
df_reactiontypes.isna().sum()

Unnamed: 0        0
Reactions Type    0
Sentiment         0
Score             0
dtype: int64

In [70]:
# Inspect the dtype pandas inferred for each column when reading the CSV
df_reactiontypes.dtypes

Unnamed: 0         int64
Reactions Type    object
Sentiment         object
Score              int64
dtype: object

In [71]:
# Drop the 'Unnamed: 0' column as it is not relevant to our analysis
# (in the preview it only holds a running row counter: 0, 1, 2, ...).
# drop(..., errors='ignore') is used instead of `del` so the cell can be
# re-run safely: a second execution is a no-op rather than a KeyError.
df_reactiontypes = df_reactiontypes.drop(columns=['Unnamed: 0'], errors='ignore')
df_reactiontypes.head()

Unnamed: 0,Reactions Type,Sentiment,Score
0,heart,positive,60
1,want,positive,70
2,disgust,negative,0
3,hate,negative,5
4,interested,positive,30


In [72]:
# Persist the cleaned ReactionTypes table to the working directory;
# index=False keeps the DataFrame's row index out of the file.
df_reactiontypes.to_csv(path_or_buf='cleaned_reactiontypes.csv', index=False)