In [1]:
# Libraries
from sqlalchemy import create_engine
import pandas as pd
import seaborn as sns

## Connect Database

In [2]:
# Note:: The make sure you use the information from your specific PostgreSQL installation
host = r'127.0.0.1' # denotes that the db in a local installation
db = r'MSDS610' # db we just created
user = r'postgres' # using the postgres user for this demo
pw = r'11207471' # this is the password established during installation
port = r'5432' # default port estabalished during install
schema = r'cleaned' # schema we just created

In [3]:
db_conn = create_engine(f"postgresql://{user}:{pw}@{host}:{port}/{db}")

In [4]:
table_name = r'global_warming2'

In [5]:
sql="select tables.table_name from information_schema.tables where (table_schema ='"+schema+"')order by 1;"
tbl_df = pd.read_sql(sql, db_conn, index_col=None)
tbl_df

Unnamed: 0,table_name
0,global_warming2
1,global_warming_Wth_Risk_Level
2,global_warming_cleaned


## Select Data

In [6]:
sql=r'SELECT * FROM ' + schema + '.' + table_name
gb = pd.read_sql(sql, db_conn, index_col=None)

In [7]:
gb.head(10)

Unnamed: 0,Country,Year,Temperature_Anomaly,CO2_Emissions,Population,Forest_Area,GDP,Renewable_Energy_Usage,Methane_Emissions,Sea_Level_Rise,...,Per_Capita_Emissions,Industrial_Activity,Air_Pollution_Index,Biodiversity_Index,Ocean_Acidification,Fossil_Fuel_Usage,Energy_Consumption_Per_Capita,Policy_Score,Average_Temperature,Risk_Level
0,Country_1,1900,-0.736197,273355700.0,318618800.0,15.244316,5450396000000.0,53.313474,5963399.0,21.498814,...,7.583504,60.305802,119.849255,49.569774,8.006157,55.004982,2474.748082,53.88227,6.971064,0
1,Country_1,1901,-0.703154,699844200.0,1494792000.0,54.731054,9044301000000.0,46.218131,2359518.0,6.527975,...,19.029111,33.844962,173.091676,9.140253,8.107997,56.227115,3411.43255,17.905168,32.393594,1
2,Country_1,1902,-0.269372,638765000.0,176942200.0,30.7361,3577474000000.0,81.932127,5874300.0,37.340943,...,5.724153,40.50115,163.346198,65.776186,7.923555,49.416237,1988.796272,53.597921,16.137289,2
3,Country_1,1903,1.138549,707345900.0,850466500.0,68.558736,3423279000000.0,30.812757,3934211.0,19.431326,...,6.383886,60.212029,170.589781,52.224952,7.887093,54.237972,2143.174604,35.826989,23.842348,2
4,Country_1,1904,0.272228,680618700.0,548738700.0,30.130873,3364767000000.0,42.098978,2613544.0,6.50557,...,2.562015,57.873658,110.710126,54.761718,7.730068,26.553637,4023.09477,43.621281,27.347198,2
5,Country_1,1905,0.779972,684892000.0,821803500.0,53.451252,5989337000000.0,71.220964,6835859.0,14.517514,...,11.344739,63.237688,125.723362,73.887508,7.807014,66.237886,2906.285255,50.310318,12.986597,0
6,Country_1,1906,0.749481,738482800.0,971784400.0,71.304433,6368574000000.0,40.377358,4102873.0,8.631829,...,11.720899,80.471674,129.66399,72.37568,7.897264,70.859571,1393.793605,57.001642,3.349868,1
7,Country_1,1907,-0.637168,689614000.0,876509400.0,60.596205,7779992000000.0,50.787751,2742369.0,31.410363,...,12.749148,50.260546,51.202577,25.957086,8.057036,42.843479,3123.596058,46.781841,34.239512,1
8,Country_1,1908,0.273943,396544800.0,902980600.0,35.219464,6451774000000.0,39.634114,2796150.0,29.593069,...,9.452678,31.780187,200.250576,68.295645,7.941333,52.268319,2860.103473,51.230245,14.993037,1
9,Country_1,1909,0.816333,311842300.0,1311659000.0,27.012174,8802959000000.0,37.75679,1785549.0,48.850478,...,11.54837,26.583421,48.526769,28.153486,8.138102,43.206151,3803.911623,28.900588,28.454129,1


## Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Random Forest model

X = gb[['CO2_Emissions', 'Methane_Emissions','GDP']]

y = gb['Risk_Level'] # Dependent


# Split data

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_hold, y_val, y_hold = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)


# Print dataset sizes
print("Training set:", X_train.shape)
print("Validation set:", X_val.shape)
print("Test set:", X_test.shape)

# Intitalize Rnd

rnd2 = RandomForestClassifier(n_estimators=100, random_state=42)


# Model Train

rnd2.fit(X_train, y_train)

Training set: (16657, 3)
Validation set: (3570, 3)
Test set: (7140, 3)


In [10]:

predictions = rnd2.predict(X_val)
predictions

array([0, 1, 1, ..., 2, 1, 0])

In [12]:
predictions.shape

(3570,)

In [30]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Calculate Accuracy
accuracy = accuracy_score(y_val, predictions)
print("Accuracy:", accuracy)

Accuracy: 1.0


In [22]:
# convert prediction to a dataframe
predictions_df = pd.DataFrame(predictions, columns=['Predicted_Risk_Level'])


## Save prediction into a nw schema

In [23]:
schema = r'analytics' # schema we just created
sql="select tables.table_name from information_schema.tables where (table_schema ='"+schema+"')order by 1;"
tbl_df = pd.read_sql(sql, db_conn, index_col=None)
tbl_df

Unnamed: 0,table_name


In [24]:
table_name = r'model1_validation_predict'

In [25]:
predictions_df.to_sql(table_name, con=db_conn, if_exists='replace', index=False, schema=schema, chunksize=1000, method='multi')


3570

In [26]:
tbl_df = pd.read_sql(sql, db_conn, index_col=None)
tbl_df

Unnamed: 0,table_name
0,model1_validation_predict


In [27]:
sql=r'SELECT * FROM ' + schema + '.' + table_name
clean_check = pd.read_sql(sql, db_conn, index_col=None)

In [28]:
clean_check.head(10)

Unnamed: 0,Predicted_Risk_Level
0,0
1,1
2,1
3,2
4,2
5,2
6,1
7,1
8,0
9,0


## Summary

In notebooks Week6_Lab_part1 and Week6_Lab_part2, we examine constructing a model using features from last week’s lab and selecting new features as well.

In the first notebook (Week6_Lab_part1), I created two models.

Before constructing the models, I modified my KMeans model. In last week’s lab, the KMeans clustering used the features I would later use for my random forest classification model to create clusters. However, this time I used all the columns in my dataset as features in the KMeans model so that the clusters can identify patterns across the entire dataset.

Once I obtained a risk level output from the KMeans, I proceeded to create my first random forest classification model. For this model, the input features were all the columns in my dataset except for the risk level column, which served as my target. My goal in using all the columns as features was to uncover another important feature for the model, beyond those identified last week.

After running the model, I found that GDP was the most important feature; all other columns/features were less significant.

In my second model, I selected the two features from last week along with the newly selected feature, GDP. The features used were:
`gb[['CO2_Emissions', 'Methane_Emissions', 'GDP']]`.

After running the model, I achieved 100% accuracy for model 2, just as with model 1. When comparing feature importance in model 2, the importance of my selected features increased.

In the second notebook, I evaluated model 2 in a manner similar to notebook 1 and generated predictions for my validation set. For the validation set, 3570 datasets were allocated. After generating predictions, we checked the accuracy using accuracy_score. With a value of 1.0, we know our model predicts 100% accurately.