# Viral Social-Media Trends and Engagement Analysis

## Importing Libraries

In [37]:
import pandas as pd
import numpy as np

## 1. Extract: Loading the Data

In [38]:
# Read the raw export into a DataFrame; the path is relative to the
# notebook's working directory.
RAW_TRENDS_CSV = 'Viral_Social_Media_Trends.csv'
df = pd.read_csv(RAW_TRENDS_CSV)


## 2. Cleanse: Data Cleaning and Preparation

### Basic Data Inspection

In [39]:
# Data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line after
# the report.
df.info()

# Statistical summary of the numeric columns
print("Statistical Summary:\n")
print(df.describe())

# First few rows
print("First few rows of dataset:\n")
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Post_ID           5000 non-null   object
 1   Platform          5000 non-null   object
 2   Hashtag           5000 non-null   object
 3   Content_Type      5000 non-null   object
 4   Region            5000 non-null   object
 5   Views             5000 non-null   int64 
 6   Likes             5000 non-null   int64 
 7   Shares            5000 non-null   int64 
 8   Comments          5000 non-null   int64 
 9   Engagement_Level  5000 non-null   object
dtypes: int64(4), object(6)
memory usage: 390.8+ KB
None
Statistical Summary:

              Views          Likes        Shares      Comments
count  5.000000e+03    5000.000000   5000.000000   5000.000000
mean   2.494066e+06  251475.029800  50519.562000  24888.393800
std    1.459490e+06  144349.583384  29066.362671  14284.504319
min    1.2

### Handle missing values

In [40]:
# Count the null entries in every column, then report them under a heading
missing_counts = df.isnull().sum()
print("\nMissing values before cleaning:")
print(missing_counts)


Missing values before cleaning:
Post_ID             0
Platform            0
Hashtag             0
Content_Type        0
Region              0
Views               0
Likes               0
Shares              0
Comments            0
Engagement_Level    0
dtype: int64


In [41]:
# Number of rows that exactly repeat an earlier row (all columns compared)
duplicate_row_count = df.duplicated().sum()
print(f"Duplicate rows: {duplicate_row_count}")

Duplicate rows: 0


In [42]:
# Force the four count columns to a numeric dtype. With errors='coerce',
# any value that cannot be parsed as a number becomes NaN instead of raising.
for count_col in ('Views', 'Likes', 'Shares', 'Comments'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

In [44]:
# Discard every row that still has a missing value in any column
# (this includes values turned into NaN by the numeric coercion).
df = df.dropna(axis=0, how='any')

In [45]:
# Summary statistics for the numeric columns, to be scanned for outliers.
# describe() only reports the statistics; no rows are flagged or removed here.
numeric_cols = ['Views', 'Likes', 'Shares', 'Comments']
numeric_summary = df[numeric_cols].describe()
print("\nOutliers in numerical columns:")
print(numeric_summary)


Outliers in numerical columns:
              Views          Likes        Shares      Comments
count  5.000000e+03    5000.000000   5000.000000   5000.000000
mean   2.494066e+06  251475.029800  50519.562000  24888.393800
std    1.459490e+06  144349.583384  29066.362671  14284.504319
min    1.266000e+03     490.000000     52.000000     18.000000
25%    1.186207e+06  126892.250000  25029.000000  12305.250000
50%    2.497373e+06  249443.000000  50839.500000  25004.000000
75%    3.759781e+06  373970.750000  75774.250000  37072.750000
max    4.999430e+06  499922.000000  99978.000000  49993.000000


## 3. Transform

Now let's transform the data for analysis:

In [46]:
# Engagement rate: interactions (likes + shares + comments) as a percentage
# of views.
# A row with zero (or negative) views has no meaningful rate. Masking those
# view counts to NaN makes the rate NaN instead of inf, so such rows are
# skipped by the group means computed from this column rather than
# distorting them.
valid_views = df['Views'].where(df['Views'] > 0)
df['Engagement_Rate'] = (df['Likes'] + df['Shares'] + df['Comments']) / valid_views * 100

In [48]:
# Categorize posts by popularity based on views.
# Bins are right-inclusive: Low is up to 1,000,000 views, Medium is above
# that up to 3,000,000, and High is anything larger.
# include_lowest=True closes the first bin on the left, so a post with
# exactly 0 views is labelled 'Low' instead of falling outside every bin
# and receiving NaN.
df['Popularity'] = pd.cut(df['Views'],
                          bins=[0, 1000000, 3000000, float('inf')],
                          labels=['Low', 'Medium', 'High'],
                          include_lowest=True)

In [50]:
def _summarize_by(frame, key):
    """Summarize posts for each distinct value of the column ``key``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``key`` plus the 'Views', 'Engagement_Rate' and
        'Post_ID' columns.
    key : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        Indexed by the values of ``key`` with columns 'Views' (mean),
        'Engagement_Rate' (mean) and 'Post_Count' (number of non-null
        Post_ID values), sorted by 'Post_Count' in descending order.
    """
    return (
        frame.groupby(key)
        .agg({
            'Views': 'mean',
            'Engagement_Rate': 'mean',
            'Post_ID': 'count'
        })
        .rename(columns={'Post_ID': 'Post_Count'})
        .sort_values('Post_Count', ascending=False)
    )


# Create platform-specific metrics.
# 'Views' receives three aggregations, so the resulting columns are
# two-level labels (column name, aggregation); values are rounded to 2 decimals.
platform_metrics = df.groupby('Platform').agg({
    'Views': ['mean', 'median', 'sum'],
    'Likes': 'mean',
    'Shares': 'mean',
    'Comments': 'mean',
    'Engagement_Rate': 'mean'
}).round(2)

# The remaining breakdowns share one shape: mean views, mean engagement rate
# and post count per group, largest groups first.

# Hashtag analysis
hashtag_analysis = _summarize_by(df, 'Hashtag')

# Region analysis
region_analysis = _summarize_by(df, 'Region')

# Content type analysis
content_type_analysis = _summarize_by(df, 'Content_Type')

# Engagement level analysis
engagement_level_analysis = _summarize_by(df, 'Engagement_Level')

## 4. Saving Processed Data

In [51]:
# Write the cleaned, row-level dataset; its positional index is left out.
df.to_csv('processed_social_media_trends.csv', index=False)

# Write each summary table to its own CSV. The index is kept because it
# holds the grouping key of each table.
summary_outputs = {
    'platform_metrics.csv': platform_metrics,
    'hashtag_analysis.csv': hashtag_analysis,
    'region_analysis.csv': region_analysis,
    'content_type_analysis.csv': content_type_analysis,
    'engagement_level_analysis.csv': engagement_level_analysis,
}
for csv_name, summary_table in summary_outputs.items():
    summary_table.to_csv(csv_name)