In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
df = pd.read_csv("CO2_emission_by_countries.csv", encoding='latin-1')  # or 'cp1252', 'ISO-8859-1', etc.
df.head()

Unnamed: 0,Country,Code,Calling Code,Year,CO2 emission (Tons),Population(2022),Area,% of World,Density(km2)
0,Afghanistan,AF,93,1750,0.0,41128771.0,652230.0,0.40%,63/km²
1,Afghanistan,AF,93,1751,0.0,41128771.0,652230.0,0.40%,63/km²
2,Afghanistan,AF,93,1752,0.0,41128771.0,652230.0,0.40%,63/km²
3,Afghanistan,AF,93,1753,0.0,41128771.0,652230.0,0.40%,63/km²
4,Afghanistan,AF,93,1754,0.0,41128771.0,652230.0,0.40%,63/km²


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59620 entries, 0 to 59619
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Country              59620 non-null  object 
 1   Code                 57452 non-null  object 
 2   Calling Code         56097 non-null  object 
 3   Year                 59620 non-null  int64  
 4   CO2 emission (Tons)  59620 non-null  float64
 5   Population(2022)     53116 non-null  float64
 6   Area                 55284 non-null  float64
 7   % of World           55284 non-null  object 
 8   Density(km2)         53116 non-null  object 
dtypes: float64(3), int64(1), object(5)
memory usage: 4.1+ MB


In [None]:
df["Country"].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Anguilla', 'Antarctica', 'Antigua and Barbuda', 'Argentina',
       'Armenia', 'Aruba', 'Australia', 'Austria', 'Azerbaijan',
       'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
       'Belgium', 'Belize', 'Benin', 'Bermuda', 'Bhutan', 'Bolivia',
       'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei',
       'Bulgaria', 'Burkina Faso', 'Burundi', 'Cambodia', 'Cameroon',
       'Canada', 'Cape Verde', 'Central African Republic', 'Chad',
       'Chile', 'China', 'Christmas Island', 'Colombia', 'Comoros',
       'Congo', 'Cook Islands', 'Costa Rica', "Cote d'Ivoire", 'Croatia',
       'Cuba', 'Curacao', 'Cyprus', 'Czechia',
       'Democratic Republic of Congo', 'Denmark', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
       'Faeroe Islands', 'Fiji', 'Finland', 'France', 'French G

In [None]:
df.isnull().sum()

Unnamed: 0,0
Country,0
Code,2168
Calling Code,3523
Year,0
CO2 emission (Tons),0
Population(2022),6504
Area,4336
% of World,4336
Density(km2),6504


In [None]:
df.drop(columns=["Calling Code"],inplace=True)

In [None]:
df.head()

Unnamed: 0,Country,Code,Year,CO2 emission (Tons),Population(2022),Area,% of World,Density(km2)
0,Afghanistan,AF,1750,0.0,41128771.0,652230.0,0.40%,63/km²
1,Afghanistan,AF,1751,0.0,41128771.0,652230.0,0.40%,63/km²
2,Afghanistan,AF,1752,0.0,41128771.0,652230.0,0.40%,63/km²
3,Afghanistan,AF,1753,0.0,41128771.0,652230.0,0.40%,63/km²
4,Afghanistan,AF,1754,0.0,41128771.0,652230.0,0.40%,63/km²


In [None]:
df.drop(columns=["Code"],inplace=True)

In [None]:
df.head()

Unnamed: 0,Country,Year,CO2 emission (Tons),Population(2022),Area,% of World,Density(km2)
0,Afghanistan,1750,0.0,41128771.0,652230.0,0.40%,63/km²
1,Afghanistan,1751,0.0,41128771.0,652230.0,0.40%,63/km²
2,Afghanistan,1752,0.0,41128771.0,652230.0,0.40%,63/km²
3,Afghanistan,1753,0.0,41128771.0,652230.0,0.40%,63/km²
4,Afghanistan,1754,0.0,41128771.0,652230.0,0.40%,63/km²


In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

for i in ["Code","Calling Code","Population(2022)","Area","% of World","Density(km2)"]:
  df.fillna(i,inplace=True)

In [None]:
df.isnull().sum()

Unnamed: 0,0
Country,0
Year,0
CO2 emission (Tons),0
Population(2022),0
Area,0
% of World,0
Density(km2),0


In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
for column in ["Country", "% of World", "Density(km2)", "Population(2022)", "Area"]:
    # Convert to string type to ensure all values are strings
    df[column] = df[column].astype(str)

    # Now apply Label Encoding
     #Create a new LabelEncoder for each column
    df[column] = label_encoder.fit_transform(df[column])


In [None]:
df.head()

Unnamed: 0,Country,Year,CO2 emission (Tons),Population(2022),Area,% of World,Density(km2)
0,0,1750,0.0,41128771.0,652230.0,4,115
1,0,1751,0.0,41128771.0,652230.0,4,115
2,0,1752,0.0,41128771.0,652230.0,4,115
3,0,1753,0.0,41128771.0,652230.0,4,115
4,0,1754,0.0,41128771.0,652230.0,4,115


In [None]:
from sklearn.model_selection import train_test_split
x = df.drop(columns=["CO2 emission (Tons)"])  # Features
y = df["CO2 emission (Tons)"]

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59620 entries, 0 to 59619
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Country              59620 non-null  int64  
 1   Year                 59620 non-null  int64  
 2   CO2 emission (Tons)  59620 non-null  float64
 3   Population(2022)     59620 non-null  int64  
 4   Area                 59620 non-null  int64  
 5   % of World           59620 non-null  int64  
 6   Density(km2)         59620 non-null  int64  
dtypes: float64(1), int64(6)
memory usage: 3.2 MB


In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)

In [None]:
ypred = model.predict(xtest)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(ytest, ypred)
mse = mean_squared_error(ytest, ypred)
r2 = r2_score(ytest, ypred)

# Print results
print("Mean Absolute Error:", mae)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

Mean Absolute Error: 2252538590.0651608
Mean Squared Error: 1.2778894477704305e+20
R-squared Score: 0.0260660589278513
