<a href="https://colab.research.google.com/github/jahidurmahim/GHG-CO2-Agrifood-prediction/blob/main/Data_Processing_for_GHGAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Import Libraries**
This cell imports the necessary Python libraries for data manipulation, visualization, and numerical operations.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

### **Load Dataset**

In [None]:
# Location of the FAO greenhouse-gas emissions CSV. Kept in a named constant so
# the path can be changed in one place.
# NOTE(review): the path presumably points into a mounted Google Drive (Colab),
# which would have to be mounted before this cell runs — confirm.
DATA_PATH = '/content/drive/MyDrive/FAO_GreenHouseGasEmission.csv'

# Raw dataset as read from disk.
df = pd.read_csv(DATA_PATH)

### **Display First Rows**
This cell displays the first 5 rows of the DataFrame `df` to get a quick overview of the data.

In [None]:
# Preview the first five records of the loaded frame.
df.head(n=5)

### **Check DataFrame Shape**
This cell prints the number of rows and columns in the DataFrame `df`.

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

### **Display DataFrame Information**
This cell provides a summary of the DataFrame, including column names, non-null counts, and data types, to identify potential issues like missing values or incorrect data types.

In [None]:
# Column-by-column overview: names, non-null counts and dtypes.
df.info()

### **Check for Missing Values**
This cell calculates and displays the count of missing values for each column in the DataFrame `df`.

In [None]:
# Per-column tally of missing entries.
df.isnull().sum()

### **Check for Duplicate Rows**
This cell counts and displays the number of duplicate rows in the DataFrame `df`.

In [None]:
# Count rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

### **Print Column Names**
This cell prints all column names in the DataFrame `df`.

In [None]:
# Show the column names as a plain list. Ending the cell with the expression
# lets the notebook display it, and a list avoids the Index wrapper (dtype
# suffix, abbreviation of long indexes) that print(df.columns) produces.
df.columns.tolist()

### **Descriptive Statistics of Numeric Features**
This cell generates descriptive statistics (count, mean, std, min, 25%, 50%, 75%, max) for all numeric columns in the DataFrame `df`.

In [None]:
# Summary statistics of the numeric columns, laid out with one feature per row.
df.describe().transpose()

### **Outlier Detection using IQR Method**
This cell identifies outliers in numeric columns using the Interquartile Range (IQR) method and summarizes the findings.

In [None]:
import pandas as pd

# Flag outliers in every numeric column with the 1.5 * IQR rule and build one
# summary record per column that has at least one flagged value.
outlier_summary = []

numeric_cols = df.select_dtypes(include='number').columns

for col in numeric_cols:
    values = df[col]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Filter the column itself rather than the whole frame: same values, but
    # no full-frame copy per column. NaN compares False on both sides, so
    # missing entries are never flagged.
    outliers = values[(values < lower) | (values > upper)].values
    outlier_count = len(outliers)

    if outlier_count > 0:
        outlier_summary.append({
            'feature': col,
            'Q1': Q1,
            'Q3': Q3,
            'lower_bound': lower,
            'upper_bound': upper,
            'outlier_count': outlier_count,
            'outlier_values': outliers
        })

# Pin the schema so the summary keeps its columns even when no feature has
# outliers (an empty record list would otherwise give a column-less frame).
outlier_df = pd.DataFrame(
    outlier_summary,
    columns=[
        'feature', 'Q1', 'Q3', 'lower_bound', 'upper_bound',
        'outlier_count', 'outlier_values',
    ],
)
outlier_df

### **Visualize Numeric Feature Distributions and Outliers**
This cell generates box plots for all numeric features to visually inspect their distributions and identify outliers.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import math

numeric_df = df.select_dtypes(include=['number'])

# Keep numeric columns that hold more than one distinct value; an entirely
# empty COMMENT_OBS column is skipped explicitly.
columns_to_plot = [
    name for name in numeric_df.columns
    if not (name == 'COMMENT_OBS' and df[name].isnull().all())
    and df[name].nunique() > 1
]

if columns_to_plot:
    # Lay the plots out on a grid that is three panels wide.
    num_features = len(columns_to_plot)
    num_cols = 3
    num_rows = math.ceil(num_features / num_cols)

    fig, axes = plt.subplots(num_rows, num_cols, figsize=(num_cols * 5, num_rows * 4))
    axes = axes.ravel()

    # One box plot per selected feature, titled with the column name.
    for panel, feature in zip(axes, columns_to_plot):
        sns.boxplot(y=df[feature], ax=panel)
        panel.set_title(feature)
        panel.set_ylabel('')

    # Remove the grid cells left over after the last feature.
    for spare_panel in axes[num_features:]:
        fig.delaxes(spare_panel)

    plt.tight_layout()
    plt.suptitle('Distribution and Outliers of Numeric Features', y=1.02, fontsize=16)
    plt.show()
else:
    print("No suitable numeric columns found for boxplot visualization.")

### **Outlier Capping**
This cell applies outlier capping to the numeric columns in the DataFrame `df_capped` using the IQR method, limiting values to within 1.5 times the IQR from the quartiles.

In [None]:
import numpy as np

# Cap (winsorize) each numeric column at the 1.5 * IQR fences, working on a
# copy so the raw frame `df` stays untouched.
df_capped = df.copy()

numeric_cols_for_capping = df_capped.select_dtypes(include=np.number).columns

print("Applying outlier capping to the following numeric columns:")
for col in numeric_cols_for_capping:
    if col == 'COMMENT_OBS':
        continue

    Q1 = df_capped[col].quantile(0.25)
    Q3 = df_capped[col].quantile(0.75)
    IQR = Q3 - Q1

    # A zero IQR makes both fences equal to Q1, so clipping would overwrite
    # every value in the column with that single number; a NaN IQR (no data)
    # gives no usable fences. Leave such columns as they are.
    if not IQR > 0:
        print(f"- {col}: skipped (IQR is {IQR}, capping would flatten the column)")
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_capped[col] = df_capped[col].clip(lower=lower_bound, upper=upper_bound)
    print(f"- {col}: Outliers capped between {lower_bound:.2f} and {upper_bound:.2f}")

print("\nOutlier capping process completed for all numeric columns.")
print("\nHead of DataFrame after outlier capping (df_capped):")
display(df_capped.head())

### **Verify Outlier Capping for TIME_PERIOD**
This cell displays descriptive statistics for the 'TIME_PERIOD' column after outlier capping to verify the changes.

In [None]:
# Summary statistics of TIME_PERIOD after capping. describe() on a single
# column already yields a Series, so no transpose is involved.
display(df_capped['TIME_PERIOD'].describe())

### **Verify Outlier Capping for OBS_VALUE**
This cell displays descriptive statistics for the 'OBS_VALUE' column after outlier capping to verify the changes.

In [None]:
# Summary statistics of OBS_VALUE after capping. describe() on a single
# column already yields a Series, so no transpose is involved.
display(df_capped['OBS_VALUE'].describe())

### **Drop Unnecessary Columns**
This cell drops a predefined list of columns that are not considered relevant for further analysis and stores the result in the DataFrame `df_cleaned`.

In [None]:
# Columns regarded as irrelevant for the analysis. The list mixes raw
# upper-case names with a few label-style names; names that are not present
# in the frame are skipped because of errors='ignore' below.
columns_to_drop = [
    'STRUCTURE', 'STRUCTURE_ID', 'ACTION', 'FREQ', 'REF_AREA',
    'INDICATOR', 'SEX', 'AGE', 'URBANISATION', 'UNIT_MEASURE',
    'COMP_BREAKDOWN_1', 'COMP_BREAKDOWN_2', 'COMP_BREAKDOWN_3',
    'DATABASE_ID', 'UNIT_MULT', 'UNIT_TYPE', 'TIME_FORMAT',
    'OBS_STATUS', 'OBS_CONF',
    'FREQ_LABEL',
    'Breakdown_1', 'Breakdown_2', 'Breakdown_3'
]

# NOTE(review): this cell used to read from `df_renamed`, which no cell in the
# notebook defines, so it failed with NameError on a fresh kernel. `df_capped`
# is the most recent frame available at this point. If a column-renaming step
# was meant to run first, restore it and point this line at its output.
df_cleaned = df_capped.drop(columns=columns_to_drop, errors='ignore')

print("DataFrame columns after dropping unnecessary columns:")
print(df_cleaned.columns.tolist())

print("\nFirst 5 rows of the cleaned DataFrame:")
display(df_cleaned.head())

### **Rename Columns for Clarity**
This cell renames several columns in `df_cleaned` to shorter, more consistent names, creating `df_final`.

In [None]:
# Old label -> shortened label for the three columns being renamed.
final_rename_mapping = dict(
    Unit_Multiplier='Unit_Mult',
    Observation_Status='Obs_Status',
    Observation_Confidence='Obs_Conf',
)

# rename() returns a new frame; df_cleaned itself is left as it was.
df_final = df_cleaned.rename(columns=final_rename_mapping)

print(
    "Final DataFrame columns after further renaming:",
    df_final.columns.tolist(),
    sep="\n",
)

print("\nFirst 5 rows of the final cleaned and renamed DataFrame:")
display(df_final.head())

### **Min-Max Normalization**
This cell applies Min-Max normalization to selected numeric columns in a copy of the DataFrame (`df_normalized`), scaling values to a range between 0 and 1.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale a copy so the capped frame keeps its original values.
df_normalized = df_capped.copy()

# Select numeric columns with more than one distinct value, leaving out the
# two explicitly excluded columns.
numeric_cols_to_normalize = []
for col in df_normalized.select_dtypes(include=['number']).columns:
    if col in ('UNIT_MULT', 'TIME_FORMAT'):
        continue
    if df_normalized[col].nunique() > 1:
        numeric_cols_to_normalize.append(col)

scaler = MinMaxScaler()

# Fit on the selected columns and overwrite them with the rescaled values.
df_normalized[numeric_cols_to_normalize] = scaler.fit_transform(
    df_normalized[numeric_cols_to_normalize]
)

print("DataFrame after Min-Max normalization:")
display(df_normalized.head())

### **Standardization (Z-score Scaling)**
This cell applies standardization (Z-score scaling) to selected numeric columns in a copy of the DataFrame (`df_standardized`), transforming data to have a mean of 0 and a standard deviation of 1.

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale a copy so df_final keeps its original values.
df_standardized = df_final.copy()

# Select numeric columns with more than one distinct value, leaving out the
# two explicitly excluded columns.
numeric_cols_to_standardize = []
for col in df_standardized.select_dtypes(include=['number']).columns:
    if col in ('Unit_Mult', 'Time_Format'):
        continue
    if df_standardized[col].nunique() > 1:
        numeric_cols_to_standardize.append(col)

scaler = StandardScaler()

# Fit on the selected columns and overwrite them with their z-scores.
df_standardized[numeric_cols_to_standardize] = scaler.fit_transform(
    df_standardized[numeric_cols_to_standardize]
)

print("DataFrame after Standardization:")
display(df_standardized.head())