
# 📊 Unemployment Analysis — Corrected Version  
_Professional notebook prepared for CodeAlpha Internship_  

**Author:** José Antônio Afonso (Afonso)  
**Date:** 2025-11-01 01:33 UTC

---


In [None]:
# 1️⃣ Imports and settings
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
from statsmodels.tsa.seasonal import seasonal_decompose

# Notebook-wide display defaults: 10x5 figures and up to 50 DataFrame columns shown.
plt.rcParams.update({'figure.figsize': (10, 5)})
pd.options.display.max_columns = 50


## 2️⃣ Load and Clean Data

In [None]:
def load_and_clean(path) -> pd.DataFrame:
    """Load an unemployment CSV and normalise its date and rate columns.

    Column names are stripped of surrounding whitespace. The date column
    (exactly ``'Date'``, otherwise the first column whose name contains
    "date") is parsed day-first and renamed to ``'Date'``; rows whose date
    cannot be parsed are dropped. The rate column (the first non-date column
    whose name contains "unemployment", otherwise the last non-date column)
    is converted to float and renamed to ``'UnemploymentRate'``.

    Args:
        path: Anything accepted by ``pandas.read_csv`` (file path or
            file-like object).

    Returns:
        The cleaned frame sorted by ``'Date'``. Rate cells that are not
        numeric after removing ``%`` and replacing ``,`` with ``.`` become
        NaN instead of aborting the load.

    Raises:
        KeyError: If no date column can be identified.
        ValueError: If the file has no column other than the date column.
    """
    df = pd.read_csv(path)
    df.columns = df.columns.str.strip()

    # Detect the date column.
    date_col = 'Date'
    if date_col not in df.columns:
        date_col = next((c for c in df.columns if 'date' in c.lower()), None)
        if date_col is None:
            raise KeyError(
                f"No date column found; available columns: {list(df.columns)}"
            )

    # Strip padding from string cells (e.g. " 31-05-2019") before parsing;
    # non-string cells are passed through untouched.
    raw_dates = df[date_col].map(lambda v: v.strip() if isinstance(v, str) else v)
    df[date_col] = pd.to_datetime(raw_dates, dayfirst=True, errors='coerce')
    # .copy() so the assignments below operate on an independent frame.
    df = df.dropna(subset=[date_col]).copy()

    # Detect the unemployment column. The date column is excluded so the
    # "last column" fallback can never select the dates themselves.
    candidates = [c for c in df.columns if c != date_col]
    named = [c for c in candidates if 'unemployment' in c.lower()]
    if named:
        rate_col = named[0]
    elif candidates:
        rate_col = candidates[-1]
    else:
        raise ValueError("No unemployment-rate column found besides the date column")

    rate_text = (df[rate_col].astype(str)
                 .str.replace('%', '', regex=False)
                 .str.replace(',', '.', regex=False)
                 .str.strip())
    # Coerce unparseable cells to NaN (mirrors the date handling above);
    # the final cast keeps the dtype float even for all-integer input.
    df[rate_col] = pd.to_numeric(rate_text, errors='coerce').astype(float)

    df = df.rename(columns={rate_col: 'UnemploymentRate', date_col: 'Date'})
    return df.sort_values('Date')

# Input CSV files; each one is loaded independently so a single bad file
# does not prevent the others from being used.
paths = [
    "data/Unemployment in India.csv",
    "data/Unemployment_Rate_upto_11_2020.csv"
]

dfs = []
for p in paths:
    try:
        d = load_and_clean(p)
    except Exception as e:
        # Best-effort loading: report the failure and continue with the next file.
        print("Error loading", p, e)
    else:
        print(f"Loaded {p} with shape {d.shape}")
        dfs.append(d)

# pd.concat([]) fails with an opaque "No objects to concatenate"; raise a
# message that names the real problem instead.
if not dfs:
    raise ValueError(f"None of the input files could be loaded: {paths}")

df = pd.concat(dfs, ignore_index=True).sort_values('Date')
print("Combined dataset shape:", df.shape)
df.head()


## 3️⃣ Exploratory Data Analysis

In [None]:
# Quick overview of the combined data: covered date range, missing values
# per column, and summary statistics.
print("Date range:", df['Date'].min().date(), "to", df['Date'].max().date())
print("\nMissing values per column:\n", df.isnull().sum())
display(df.describe())

# Every row of the combined dataset plotted against its date.
plt.figure(figsize=(10,4))
plt.plot(df['Date'], df['UnemploymentRate'], marker='o', color='teal')
# Title previously contained a mis-encoded em dash; restored to the intended character.
plt.title('Unemployment Rate Over Time — Combined Dataset')
plt.xlabel('Date')
plt.ylabel('Unemployment Rate (%)')
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


## 4️⃣ National Trend and Yearly Average

In [None]:
# National monthly series. The combined data can hold many rows per date, so
# the rows are averaged per calendar month. resample('MS') labels each month
# by its first day and works with duplicate or non-month-start dates, which
# set_index(...).asfreq('MS') does not (it fails on duplicate index labels
# and returns NaN for every date that is not exactly a month start).
national = (
    df.set_index('Date')[['UnemploymentRate']]
      .resample('MS')
      .mean()
)
# Fill months that have no observations by linear interpolation.
national['UnemploymentRate'] = national['UnemploymentRate'].interpolate()

fig, ax = plt.subplots()
# Plot through matplotlib directly so the x-axis holds real matplotlib dates;
# DateFormatter mislabels ticks on the period-based axis that
# Series.plot() sets up for regular time series.
ax.plot(national.index, national['UnemploymentRate'], color='darkcyan', linewidth=2)
ax.set_title('National Average Unemployment Rate (Monthly)')
ax.set_xlabel('Date')
ax.set_ylabel('Unemployment Rate (%)')
ax.xaxis.set_major_formatter(DateFormatter('%Y-%m'))
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Mean rate per calendar year across all rows.
df['Year'] = df['Date'].dt.year
yearly = df.groupby('Year')['UnemploymentRate'].mean()

plt.figure(figsize=(8,4))
yearly.plot(kind='bar', color='orange')
plt.title('Average Unemployment Rate by Year')
plt.ylabel('Unemployment Rate (%)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## 5️⃣ Time Series Analysis — Seasonal Decomposition

In [None]:
# Additive decomposition of the monthly national series with a 12-month
# period. The decomposition is skipped when fewer than 24 non-missing
# monthly points are available.
series = national['UnemploymentRate'].dropna()
if len(series) < 24:
    print("Series too short for decomposition (need >= 24 months).")
else:
    result = seasonal_decompose(
        series,
        model='additive',
        period=12,
        extrapolate_trend='freq',
    )
    fig = result.plot()
    fig.set_size_inches(12, 8)
    plt.tight_layout()
    plt.show()


## 6️⃣ COVID-19 Impact Analysis

In [None]:
# Label every row by period relative to 2020 (vectorised instead of a per-row lambda).
years = df['Date'].dt.year
df['Period'] = np.select(
    [years < 2020, years == 2020],
    ['pre-2020', '2020'],
    default='post-2020',
)
avg_rates = df.groupby('Period')['UnemploymentRate'].mean()
print(avg_rates)

# DataFrame.boxplot(by=...) creates its own figure unless an Axes is passed,
# which left the plt.figure(figsize=(6,4)) canvas empty and the size unused.
# Create the Axes explicitly and hand it to pandas.
fig, ax = plt.subplots(figsize=(6, 4))
df.boxplot(column='UnemploymentRate', by='Period', ax=ax)
ax.set_title('Unemployment Rate by Period')
fig.suptitle('')  # remove the automatic "Boxplot grouped by Period" super-title
ax.set_ylabel('Unemployment Rate (%)')
plt.tight_layout()
plt.show()


## 7️⃣ Insights & Conclusions
- The corrected plots now display unemployment trends properly.
- Unemployment rates clearly increased during 2020 (COVID-19 impact).
- Seasonal decomposition revealed periodic variation across months.
- Data cleaning ensured consistent dates and numeric values for accurate visualization.

### Next Steps
1. Forecast future trends using ARIMA or Prophet.
2. Build a Streamlit dashboard for interactive data exploration.
3. Combine unemployment data with GDP or inflation for deeper insights.
