# Data Preparation and Cleaning
---

In [20]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt # we only need pyplot
sb.set() # set the default Seaborn style for graphics
from scipy.stats import zscore

In [21]:
# Read fashiondata.csv into a DataFrame and preview its first rows
fashiondata = pd.read_csv('fashiondata.csv')
fashiondata.head()

Unnamed: 0,UserId,Followings,Followers,MediaCount,BrandName,BrandCategory,Hashtags,Caption,ImgURL,Likes,...,NumberOfPeople,NumberOfFashionProduct,Anger,Contempt,Disgust,Fear,Happiness,Neutral,Sadness,Surprise
0,1171579752855683619_212070047,518.0,9840.0,541,abercrombie,High street,"beautiful, summer, fashion, love, cute, food, ...","We were born to be REAL, not to be perfect. ...",https://scontent.cdninstagram.com/t51.2885-15/...,97,...,0.931486,3.42211,0.083862,0.015089,0.000583,2.6e-05,1.7e-05,0.890586,0.009657,0.000181
1,1171594777274371222_176762322,7333.0,2300.0,272,abercrombie,High street,"teen, model, brunette, selfie, hollister, snap...","Gotta run, but first, let me take selfie. Me...",https://scontent.cdninstagram.com/t51.2885-15/...,94,...,0.466329,2.91971,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,91,...,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1171407552643586413_581125501,131.0,605.0,106,abercrombie,High street,"alexandani, cute, llbean, beanboots, anthropol...",dress to impress\U0001f457,https://scontent.cdninstagram.com/t51.2885-15/...,94,...,1.1165,4.69096,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1171508187966229230_2797323089,132.0,456.0,56,abercrombie,High street,"abercrombieandfitch, shopping, love, hollister...",#love #shopping #shoppen #hollister #abercro...,https://scontent.cdninstagram.com/t51.2885-15/...,9,...,0.393063,1.45586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# Inspect the dtype pandas assigned to each column
fashiondata.dtypes

UserId                     object
Followings                float64
Followers                 float64
MediaCount                  int64
BrandName                  object
BrandCategory              object
Hashtags                   object
Caption                    object
ImgURL                     object
Likes                       int64
Comments                    int64
CreationTime                int64
Link                       object
Selfie                    float64
BodySnap                  float64
Marketing                 float64
ProductOnly               float64
NonFashion                float64
Face                      float64
Logo                      float64
BrandLogo                 float64
Smile                     float64
Outdoor                   float64
NumberOfPeople            float64
NumberOfFashionProduct    float64
Anger                     float64
Contempt                  float64
Disgust                   float64
Fear                      float64
Happiness     

In [35]:
# Summarise the frame: row range, column names, non-null counts and dtypes
fashiondata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24752 entries, 0 to 24751
Data columns (total 33 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UserId                  24752 non-null  object 
 1   Followings              24752 non-null  float64
 2   Followers               24752 non-null  float64
 3   MediaCount              24752 non-null  int64  
 4   BrandName               24752 non-null  object 
 5   BrandCategory           24752 non-null  object 
 6   Hashtags                24606 non-null  object 
 7   Caption                 24717 non-null  object 
 8   ImgURL                  24752 non-null  object 
 9   Likes                   24752 non-null  int64  
 10  Comments                24752 non-null  int64  
 11  CreationTime            24752 non-null  int64  
 12  Link                    24752 non-null  object 
 13  Selfie                  24752 non-null  float64
 14  BodySnap                24752 non-null

### Sanity check for duplicates & missing values 

In [24]:
# Mark every row that has an identical copy elsewhere in the frame;
# keep=False flags all occurrences rather than only the repeats.
is_duplicated = fashiondata.duplicated(keep=False)
duplicate_rows = fashiondata[is_duplicated]
print("Duplicate rows:", duplicate_rows, sep="\n")

Duplicate rows:
                               UserId  Followings  Followers  MediaCount  \
100    1171299198127266963_2560516238       248.0      164.0         232   
101    1171299198127266963_2560516238       248.0      164.0         232   
120     1171231788214163629_193097469       451.0     4365.0         485   
121     1171231788214163629_193097469       451.0     4365.0         485   
529    1171022589527811636_2083414312      4998.0     4211.0        1293   
...                               ...         ...        ...         ...   
24742  1171578577234348127_1601300177      7495.0     4224.0        5558   
24744  1171578445549979740_1601300177      7495.0     4224.0        5558   
24745  1171578445549979740_1601300177      7495.0     4224.0        5558   
24749  1171566525099211846_1433754542       209.0     8737.0       46254   
24750  1171566525099211846_1433754542       209.0     8737.0       46254   

         BrandName BrandCategory  \
100    abercrombie   High street   

In [25]:
# Count the missing entries in each column
missing_values = fashiondata.isna().sum()
print("\nMissing values:", missing_values, sep="\n")


Missing values:
UserId                      0
Followings                  0
Followers                   0
MediaCount                  0
BrandName                   0
BrandCategory               0
Hashtags                  146
Caption                    35
ImgURL                      0
Likes                       0
Comments                    0
CreationTime                0
Link                        0
Selfie                      0
BodySnap                    0
Marketing                   0
ProductOnly                 0
NonFashion                  0
Face                        0
Logo                        0
BrandLogo                   0
Smile                       0
Outdoor                     0
NumberOfPeople              0
NumberOfFashionProduct      0
Anger                       0
Contempt                    0
Disgust                     0
Fear                        0
Happiness                   0
Neutral                     0
Sadness                     0
Surprise               

When we check for duplicate rows, the duplicates share the same `UserId` and, consequently, the same user details, which suggests that the same user may have several posts included in the dataset. Missing values are present only in the string variables `Hashtags` and `Caption`, which are not the focus of our analysis. Hence, no rows or values are removed.

### Aggregating datasets by unique brand category 

In [26]:
# Build a per-column mask that is True for int64 and float64 columns only
column_dtypes = fashiondata.dtypes
is_int64_column = column_dtypes == np.int64
is_float64_column = column_dtypes == np.float64
condition = is_int64_column | is_float64_column

# Numeric-only view of the data; the printed dims are those of the full frame
fashiondatanum = fashiondata.loc[:, condition]
print("Data dims : ", fashiondata.shape)
fashiondatanum.info()

Data dims :  (24752, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24752 entries, 0 to 24751
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Followings              24752 non-null  float64
 1   Followers               24752 non-null  float64
 2   MediaCount              24752 non-null  int64  
 3   Likes                   24752 non-null  int64  
 4   Comments                24752 non-null  int64  
 5   CreationTime            24752 non-null  int64  
 6   Selfie                  24752 non-null  float64
 7   BodySnap                24752 non-null  float64
 8   Marketing               24752 non-null  float64
 9   ProductOnly             24752 non-null  float64
 10  NonFashion              24752 non-null  float64
 11  Face                    24752 non-null  float64
 12  Logo                    24752 non-null  float64
 13  BrandLogo               24752 non-null  float64
 14  Smile        

In [27]:
# Build one DataFrame per distinct BrandCategory value, keyed by that value
unique_brandcategory = fashiondata['BrandCategory'].unique()

dfs_by_brandcategory = {
    BrandCategory: fashiondata[fashiondata['BrandCategory'] == BrandCategory]
    for BrandCategory in unique_brandcategory
}

# Print each subset to confirm the split by BrandCategory worked
for BrandCategory, df in dfs_by_brandcategory.items():
    print(f"BrandCategory: {BrandCategory}", df, sep="\n")

BrandCategory: High street
                               UserId  Followings  Followers  MediaCount  \
0       1171579752855683619_212070047       518.0     9840.0         541   
1       1171594777274371222_176762322      7333.0     2300.0         272   
2       1171407552643586413_581125501       131.0      605.0         106   
3       1171407552643586413_581125501       131.0      605.0         106   
4      1171508187966229230_2797323089       132.0      456.0          56   
...                               ...         ...        ...         ...   
24747   1171575731978099465_389885373       459.0     3236.0        1318   
24748   1171566570370586806_253427645       209.0     2563.0         469   
24749  1171566525099211846_1433754542       209.0     8737.0       46254   
24750  1171566525099211846_1433754542       209.0     8737.0       46254   
24751   1171565778125281443_253427645       209.0     2563.0         469   

         BrandName BrandCategory  \
0      abercrombie   Hig

Our aim is to analyze the characteristics of a well-performing social media fashion post for each tier of brand category. Hence, we group the brands of each brand category, together with their posts, into separate datasets.

 ### Function to remove outliers

In [36]:
# Function to remove outliers using IQR
def remove_outliers(df, factor=1.5):
    """Drop every row that holds at least one IQR outlier.

    For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``. A row is discarded as soon as any of its values
    lies strictly outside the fences of its own column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns all support ``quantile`` and ordering
        comparisons (i.e. numeric data).
    factor : float, default 1.5
        Multiple of the IQR used to place the fences. The default keeps the
        original behaviour; larger values remove fewer rows.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (original index labels preserved) in which no
        value falls outside its column's fences.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1

    lower_fence = q1 - factor * iqr
    upper_fence = q3 + factor * iqr

    # NOTE: a column with IQR == 0 has both fences on the same value, so every
    # row that differs from that value in such a column is dropped.
    is_outlier = (df < lower_fence) | (df > upper_fence)
    return df[~is_outlier.any(axis=1)]

### Segregating numeric variables by brand category, using the dtype filter to extract the numeric data

In [30]:
# Build one outlier-free numeric frame per brand category.
# remove_outliers is applied to each category's subset separately, so the
# quartiles are computed within that category rather than on the full dataset.
dfs_wo_outliers = {}
for category in unique_brandcategory:
    category_data = dfs_by_brandcategory[category]
    numeric_data = category_data.select_dtypes(include=[np.number])  # Ensure only numeric data is considered
    cleaned_data = remove_outliers(numeric_data)  # Remove outliers from numeric data
    dfs_wo_outliers[category] = cleaned_data  # Store the cleaned frame under its category name

### Creating dataframes based on each brand category pre and post outlier removal 

In [33]:
# Numeric columns of each brand category BEFORE outlier removal.
# These names were not defined by any earlier cell, so this cell failed with a
# NameError on a fresh kernel. They are derived here with the same numeric
# dtype filter used for the outlier-removal step.
# NOTE(review): confirm this matches how the frames were originally built.
highstreetnum = dfs_by_brandcategory['High street'].select_dtypes(include=[np.number])
designernum = dfs_by_brandcategory['Designer'].select_dtypes(include=[np.number])
megacouturenum = dfs_by_brandcategory['Mega couture'].select_dtypes(include=[np.number])
smallcouturenum = dfs_by_brandcategory['Small couture'].select_dtypes(include=[np.number])

# Save each pre-cleaning numeric frame to its own CSV file
highstreetnum.to_csv('highstreetnum.csv')
designernum.to_csv('designernum.csv')
megacouturenum.to_csv('megacouturenum.csv')
smallcouturenum.to_csv('smallcouturenum.csv')

In [34]:
# Save the outlier-free numeric frame of each brand category to its own CSV file
cleaned_csv_by_tier = {
    'High street': 'highstreetnum_wo_outliers.csv',
    'Designer': 'designernum_wo_outliers.csv',
    'Mega couture': 'megacouturenum_wo_outliers.csv',
    'Small couture': 'smallcouturenum_wo_outliers.csv',
}
for brand_tier, csv_name in cleaned_csv_by_tier.items():
    dfs_wo_outliers[brand_tier].to_csv(csv_name)