# Project Checkpoint 4 - ETL Process 
## Group 3: Jiayi Fan, Jane Lee, Han Wang, Dinghao Xu, Jinyi Zhang, Yue Zhang

### Importing packages & data

In [1]:
import os
import warnings

import pandas as pd
from sqlalchemy import create_engine
# NOTE: this suppresses every warning category for the rest of the session,
# so pandas warnings (e.g. SettingWithCopyWarning) and deprecation notices
# raised by later cells will not be shown.
warnings.filterwarnings("ignore")

In [2]:
import os
if not os.path.exists("train.csv"):
    # Download dataset from Kaggle
    os.environ['KAGGLE_USERNAME'] = "jif087"
    os.environ['KAGGLE_KEY'] = "7df2494915e25e7323e0c67a9a427084"
    !kaggle datasets download -d clkmuhammed/creditscoreclassification

    # Upzip the file
    !unzip "creditscoreclassification.zip"

In [3]:
# Load the raw training file and normalise every column name to lower case
# in a single chained expression.
df = pd.read_csv('train.csv').rename(columns=str.lower)
df

Unnamed: 0,id,customer_id,month,name,age,ssn,occupation,annual_income,monthly_inhand_salary,num_bank_accounts,...,credit_mix,outstanding_debt,credit_utilization_ratio,credit_history_age,payment_of_min_amount,total_emi_per_month,amount_invested_monthly,payment_behaviour,monthly_balance,credit_score
0,5634,3392,1,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,26.822620,265.0,No,49.574949,21.465380,High_spent_Small_value_payments,312.494089,Good
1,5635,3392,2,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.944960,266.0,No,49.574949,21.465380,Low_spent_Large_value_payments,284.629162,Good
2,5636,3392,3,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,28.609352,267.0,No,49.574949,21.465380,Low_spent_Medium_value_payments,331.209863,Good
3,5637,3392,4,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,31.377862,268.0,No,49.574949,21.465380,Low_spent_Small_value_payments,223.451310,Good
4,5638,3392,5,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12,1824.843333,3.0,...,Good,809.98,24.797347,269.0,No,49.574949,21.465380,High_spent_Medium_value_payments,341.489231,Good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,155625,37932,4,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,34.663572,378.0,No,35.104023,24.028477,High_spent_Large_value_payments,479.866228,Poor
99996,155626,37932,5,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,40.565631,379.0,No,35.104023,24.028477,High_spent_Medium_value_payments,496.651610,Poor
99997,155627,37932,6,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,41.255522,380.0,No,35.104023,24.028477,High_spent_Large_value_payments,516.809083,Poor
99998,155628,37932,7,Nicks,25.0,78735990.0,Mechanic,39628.99,3359.415833,4.0,...,Good,502.38,33.638208,381.0,No,35.104023,24.028477,Low_spent_Large_value_payments,319.164979,Standard


### Establish a connection & Create our tables

In [4]:
# Connection string for the PostgreSQL server.
# It can be overridden through the PROJECT_DB_URL environment variable so real
# credentials never have to be written into the notebook; when the variable is
# not set, the original local development URL is used unchanged.
conn_url = os.environ.get('PROJECT_DB_URL', 'postgresql://postgres:123@localhost/project')

# Create an engine that connects to PostgreSQL server
engine = create_engine(conn_url)

# Establish a connection (reused by the raw SQL cells below)
connection = engine.connect()

In [5]:
# Reset the database so the notebook can be re-run from scratch:
# dropping the `public` schema with CASCADE removes every object inside it,
# and the schema is then recreated empty.
# WARNING: destructive - anything previously stored in `public` is lost.
stmt = """
    DROP SCHEMA public CASCADE;
    CREATE SCHEMA public;
"""

# Execute the statements
connection.execute(stmt)

<sqlalchemy.engine.result.ResultProxy at 0x7fe661bc1f50>

In [6]:
# DDL for the whole schema (15 tables), executed as one multi-statement string.
#   - `customers` is keyed by customer_id; `customer_month` holds one row per
#     monthly record (`id`) and points back to `customers`.
#   - The per-month tables (customer_monthly_balance, customer_delay,
#     customer_loan_amount, customer_monthly_income, customer_monthly_spent,
#     customer_credit, customer_bank_information) use `id` both as their
#     primary key and as a foreign key to customer_month(id).
#   - loans, customer_payment_behaviour, credit_score and credit_mix are
#     lookup tables referenced through surrogate integer ids.
#   - Referenced tables are created before the tables that reference them.
# NOTE(review): total_emi_per_month is declared in both customer_loan_amount
# and customer_monthly_spent - confirm the duplication is intended.
stmt = """
CREATE TABLE customers (
	customer_id 				int,
	name 						varchar(100) NOT NULL,
	age							int NOT NULL,
	SSN 						varchar(30) NOT NULL UNIQUE,
	occupation 					varchar(100) NOT NULL,
	annual_income 				numeric(10,2) NOT NULL,
	PRIMARY KEY (customer_id)
);

CREATE TABLE customer_month (
	id 							int,
	customer_id 				int NOT NULL,
	month 						int NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (customer_id) REFERENCES customers (customer_id)
);

CREATE TABLE loans (
	loan_id 					int,
	type_of_loan				varchar(60) NOT NULL,
	PRIMARY KEY (loan_id)
);

CREATE TABLE customer_payment_behaviour (
	behaviour_id					int,
	payment_behaviour			varchar(100) NOT NULL,
	PRIMARY KEY (behaviour_id)
);

CREATE TABLE customer_monthly_balance (
	id							int,
	monthly_balance				numeric(10,2) NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (id) REFERENCES customer_month (id)
);

CREATE TABLE customer_delay(
	id 							int,
	delay_from_due_date 		int NOT NULL,
	num_of_delayed_payment 		int NOT NULL,
	payment_of_min_amount 		varchar NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (id) REFERENCES customer_month (id)
);


CREATE TABLE customer_loan_amount (
	id 							int,
	outstanding_debt 			numeric(10,2) NOT NULL,
	total_emi_per_month 		numeric(10,2) NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (id) REFERENCES customer_month (id)
);

CREATE TABLE  credit_score (
	credit_score_id 			int,
	credit_score 				varchar(10) NOT NULL,
PRIMARY KEY (credit_score_id)
);

CREATE TABLE credit_mix (
	credit_mix_id 				int,
	credit_mix    				varchar(10) NOT NULL,
	PRIMARY KEY (credit_mix_id)
);

CREATE TABLE customer_monthly_income (
	id                    		int,
	monthly_inhand_salary 		numeric(10,2) NOT NULL,
	PRIMARY KEY (id),
 	FOREIGN KEY (id) REFERENCES customer_month (id)
);

CREATE TABLE customer_monthly_spent (
	id   						int,
	total_emi_per_month   		numeric(10,2) NOT NULL,
	amount_invested_monthly 	numeric(10,2) NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (id) REFERENCES customer_month (id)
);

CREATE TABLE customer_credit (
	id   						int,
	credit_score_id   			int NOT NULL,
	num_credit_inquiries   		int NOT NULL,
	credit_history_age   		int NOT NULL,
	credit_utilization_ratio   	numeric(5,2) NOT NULL,
	changed_credit_limit   		numeric(5,2) NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (credit_score_id) REFERENCES credit_score (credit_score_id),
	FOREIGN KEY (id) REFERENCES customer_month (id)
);

CREATE TABLE customer_loan (
	customer_id					int,
	loan_id						int,
	num_of_loan_type					int NOT NULL,
	PRIMARY KEY (customer_id, loan_id),
	FOREIGN KEY (loan_id) REFERENCES loans (loan_id),
	FOREIGN KEY (customer_id) REFERENCES customers (customer_id)
);

CREATE TABLE customer_annual_credit (
	customer_id					int,
	credit_mix_id				int NOT NULL,
	PRIMARY KEY (customer_id),
	FOREIGN KEY (credit_mix_id) REFERENCES credit_mix (credit_mix_id),
	FOREIGN KEY (customer_id) REFERENCES customers (customer_id)
);

CREATE TABLE customer_bank_information (
	id							int,
	num_bank_accounts			int NOT NULL,
	num_credit_card				int NOT NULL,
	interest_rate				int NOT NULL,
	num_of_loan					int NOT NULL,
	behaviour_id					int NOT NULL,
	PRIMARY KEY (id),
	FOREIGN KEY (behaviour_id) REFERENCES customer_payment_behaviour (behaviour_id),
	FOREIGN KEY (id) REFERENCES customer_month (id)
);
"""   

# Execute the statement to create tables
connection.execute(stmt)

<sqlalchemy.engine.result.ResultProxy at 0x7fe661bd1e50>

### Inserting data into our database

#### Table 1: customers

In [7]:
# Customer attributes repeat on each monthly row, so keep only the first row
# seen for every customer_id and project the columns of the `customers` table.
customer_columns = ['customer_id', 'name', 'age', 'ssn', 'occupation', 'annual_income']
df2 = df.drop_duplicates(subset=['customer_id'])
customers = df2[customer_columns]
customers.head()

Unnamed: 0,customer_id,name,age,ssn,occupation,annual_income
0,3392,Aaron Maashoh,23.0,821000265.0,Scientist,19114.12
8,8625,Rick Rothackerj,28.0,4075839.0,Teacher,34847.84
16,11708,Langep,34.0,486853974.0,Engineer,143162.64
24,47249,Jasond,54.0,72316145.0,Entrepreneur,30689.89
32,7387,Deepaa,21.0,615067821.0,Developer,35547.71


In [8]:
# Bulk-load the customers in one to_sql call (same pattern as every other table
# in this notebook) instead of issuing one INSERT round-trip per row.
# index=False keeps the DataFrame index out of the table; if_exists='append'
# inserts into the table created by the DDL cell rather than replacing it.
# NOTE(review): `ssn` is read from the CSV as a float, so it lands in the
# varchar column as text such as '821000265.0' - confirm whether it should be
# normalised to a digit string before loading.
customers.to_sql(name='customers', con=engine, if_exists='append', index=False)

#### Table 2: customer month

In [9]:
# Independent copy of the columns that make up the `customer_month` table
# (one row per monthly record id).
month_columns = ['id', 'customer_id', 'month']
customer_month = df[month_columns].copy()
customer_month.head()

Unnamed: 0,id,customer_id,month
0,5634,3392,1
1,5635,3392,2
2,5636,3392,3
3,5637,3392,4
4,5638,3392,5


In [10]:
# Bulk-load the monthly records in one to_sql call (same pattern as the other
# tables) instead of executing a separate INSERT statement for every row.
# The DataFrame columns (id, customer_id, month) match the table columns, and
# if_exists='append' inserts into the table created by the DDL cell.
customer_month.to_sql(name='customer_month', con=engine, if_exists='append', index=False)

#### Table 3: loans

In [11]:
# Each type_of_loan cell is a list such as "A, B, and C": strip the literal
# "and ", split on ", " and collect the distinct loan types in order of first
# appearance, then number them from 1 to form the lookup table.
loan_type_cells = df['type_of_loan'].str.replace('and ', '', regex=False)
distinct_loan_types = pd.unique(loan_type_cells.str.split(', ', expand=True).stack())
tloans_df = pd.DataFrame({
    'loan_id': range(1, len(distinct_loan_types) + 1),
    'type_of_loan': distinct_loan_types,
})
loans_df = tloans_df[['loan_id', 'type_of_loan']].drop_duplicates()
loans_df

Unnamed: 0,loan_id,type_of_loan
0,1,Auto Loan
1,2,Credit-Builder Loan
2,3,Personal Loan
3,4,Home Equity Loan
4,5,Not Specified
5,6,No Data
6,7,Mortgage Loan
7,8,Student Loan
8,9,Debt Consolidation Loan
9,10,Payday Loan


In [12]:
# Append the loan-type lookup rows to the `loans` table (index is not written).
loans_df.to_sql(
    name='loans',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 4: customer payment behavior

In [13]:
# Lookup table of distinct payment_behaviour labels, numbered from 1 in order
# of first appearance in the data.
behaviour_labels = df.payment_behaviour.unique()
tpayment_behaviour_df = pd.DataFrame({
    'behaviour_id': range(1, len(behaviour_labels) + 1),
    'payment_behaviour': behaviour_labels,
})
customer_payment_behaviour_df = (
    tpayment_behaviour_df[['behaviour_id', 'payment_behaviour']].drop_duplicates()
)
customer_payment_behaviour_df

Unnamed: 0,behaviour_id,payment_behaviour
0,1,High_spent_Small_value_payments
1,2,Low_spent_Large_value_payments
2,3,Low_spent_Medium_value_payments
3,4,Low_spent_Small_value_payments
4,5,High_spent_Medium_value_payments
5,6,High_spent_Large_value_payments


In [14]:
# Append the payment-behaviour lookup rows to `customer_payment_behaviour`.
customer_payment_behaviour_df.to_sql(
    name='customer_payment_behaviour',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 5: customer monthly balance

In [15]:
# Monthly balance per record id, with exact duplicate rows removed.
customer_monthly_balance_df = df[['id', 'monthly_balance']].drop_duplicates()
customer_monthly_balance_df.head()

Unnamed: 0,id,monthly_balance
0,5634,312.494089
1,5635,284.629162
2,5636,331.209863
3,5637,223.45131
4,5638,341.489231


In [16]:
# Append the monthly balances to the `customer_monthly_balance` table.
customer_monthly_balance_df.to_sql(
    name='customer_monthly_balance',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 6: customer delay 

In [17]:
# Delay-related columns keyed by the monthly record id, matching the
# column list of the `customer_delay` table.
delay_columns = [
    'id',
    'delay_from_due_date',
    'num_of_delayed_payment',
    'payment_of_min_amount',
]
customer_delay = df[delay_columns]
customer_delay.head()

Unnamed: 0,id,delay_from_due_date,num_of_delayed_payment,payment_of_min_amount
0,5634,3.0,7.0,No
1,5635,3.0,4.0,No
2,5636,3.0,7.0,No
3,5637,5.0,4.0,No
4,5638,6.0,4.0,No


In [18]:
# Append the delay records to the `customer_delay` table.
customer_delay.to_sql(
    name='customer_delay',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 7: customer loan amount 

In [19]:
# Debt columns keyed by the monthly record id, matching the column list of
# the `customer_loan_amount` table.
loan_amount_columns = ['id', 'outstanding_debt', 'total_emi_per_month']
customer_loan_amount = df[loan_amount_columns]
customer_loan_amount.head()

Unnamed: 0,id,outstanding_debt,total_emi_per_month
0,5634,809.98,49.574949
1,5635,809.98,49.574949
2,5636,809.98,49.574949
3,5637,809.98,49.574949
4,5638,809.98,49.574949


In [20]:
# Append the debt records to the `customer_loan_amount` table.
customer_loan_amount.to_sql(
    name='customer_loan_amount',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 8: credit score

In [21]:
# Lookup table of distinct credit_score labels, numbered from 1 in order of
# first appearance in the data.
credit_score_labels = df.credit_score.unique()
Credit_Score = pd.DataFrame({
    'credit_score_id': range(1, len(credit_score_labels) + 1),
    'credit_score': credit_score_labels,
})
Credit_Score

Unnamed: 0,credit_score_id,credit_score
0,1,Good
1,2,Standard
2,3,Poor


In [22]:
# Append the credit-score lookup rows to the `credit_score` table.
Credit_Score.to_sql(
    name='credit_score',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 9: credit mix

In [23]:
# Lookup table of distinct credit_mix labels, numbered from 1 in order of
# first appearance in the data.
credit_mix_labels = df.credit_mix.unique()
Credit_Mix = pd.DataFrame({
    'credit_mix_id': range(1, len(credit_mix_labels) + 1),
    'credit_mix': credit_mix_labels,
})
Credit_Mix

Unnamed: 0,credit_mix_id,credit_mix
0,1,Good
1,2,Standard
2,3,Bad


In [24]:
# Append the credit-mix lookup rows to the `credit_mix` table.
Credit_Mix.to_sql(
    name='credit_mix',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 10: customer monthly income

In [25]:
# In-hand salary per monthly record id, matching the column list of the
# `customer_monthly_income` table.
monthly_income_columns = ['id', 'monthly_inhand_salary']
customer_monthly_income = df[monthly_income_columns]
customer_monthly_income.head()

Unnamed: 0,id,monthly_inhand_salary
0,5634,1824.843333
1,5635,1824.843333
2,5636,1824.843333
3,5637,1824.843333
4,5638,1824.843333


In [26]:
# Append the salary records to the `customer_monthly_income` table.
customer_monthly_income.to_sql(
    name='customer_monthly_income',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 11: customer monthly spent

In [27]:
# EMI and investment amounts per monthly record id, matching the column list
# of the `customer_monthly_spent` table.
monthly_spent_columns = ['id', 'total_emi_per_month', 'amount_invested_monthly']
customer_monthly_spent_df = df[monthly_spent_columns]
customer_monthly_spent_df

Unnamed: 0,id,total_emi_per_month,amount_invested_monthly
0,5634,49.574949,21.465380
1,5635,49.574949,21.465380
2,5636,49.574949,21.465380
3,5637,49.574949,21.465380
4,5638,49.574949,21.465380
...,...,...,...
99995,155625,35.104023,24.028477
99996,155626,35.104023,24.028477
99997,155627,35.104023,24.028477
99998,155628,35.104023,24.028477


In [28]:
# Append the spending records to the `customer_monthly_spent` table.
customer_monthly_spent_df.to_sql(
    name='customer_monthly_spent',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 12: customer credit

In [29]:
# Map each credit_score label to the surrogate id assigned in Credit_Score.
credict_score_dict = dict(zip(Credit_Score['credit_score'], Credit_Score['credit_score_id']))

# Build the frame in one chain: `assign` returns a new DataFrame, so no column is
# written onto a slice of `df` (which would trigger SettingWithCopyWarning).
# Replacing the existing column keeps it in its original position.
customer_credit_df = (
    df[['id', 'num_credit_inquiries', 'credit_history_age',
        'credit_score', 'credit_utilization_ratio', 'changed_credit_limit']]
    .assign(credit_score=lambda frame: frame['credit_score'].map(credict_score_dict))
    .rename(columns={'credit_score': 'credit_score_id'})
)
customer_credit_df

Unnamed: 0,id,num_credit_inquiries,credit_history_age,credit_score_id,credit_utilization_ratio,changed_credit_limit
0,5634,4.0,265.0,1,26.822620,11.27
1,5635,4.0,266.0,1,31.944960,11.27
2,5636,4.0,267.0,1,28.609352,11.27
3,5637,4.0,268.0,1,31.377862,6.27
4,5638,4.0,269.0,1,24.797347,11.27
...,...,...,...,...,...,...
99995,155625,3.0,378.0,3,34.663572,11.50
99996,155626,3.0,379.0,3,40.565631,11.50
99997,155627,3.0,380.0,3,41.255522,11.50
99998,155628,3.0,381.0,2,33.638208,11.50


In [30]:
# Append the credit records to the `customer_credit` table.
customer_credit_df.to_sql(
    name='customer_credit',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 13: customer loan

In [31]:
# One row per (customer, loan type) with the number of occurrences.
# Everything is derived from the de-duplicated frame itself through `assign`,
# so no column is written onto a slice of `df` (SettingWithCopyWarning), and
# regex=False makes the literal "and " removal independent of the pandas
# version's default for str.replace.
customer_loan_df = (
    df[['customer_id', 'type_of_loan']]
    .drop_duplicates()
    .assign(type_of_loan=lambda frame: (
        frame['type_of_loan']
        .str.replace('and ', '', regex=False)
        .str.split(', ')))
    .explode('type_of_loan')          # one row per individual loan type
    .assign(num_of_loan_type=1)       # summed below to count occurrences
    .groupby(['customer_id', 'type_of_loan'])
    .sum()
    .reset_index()
)

# Replace each loan-type label with the surrogate id assigned in loans_df.
loans_dict = dict(zip(loans_df['type_of_loan'], loans_df['loan_id']))
customer_loan_df = (
    customer_loan_df
    .assign(type_of_loan=lambda frame: frame['type_of_loan'].map(loans_dict))
    .rename(columns={'type_of_loan': 'loan_id'})
)
customer_loan_df

Unnamed: 0,customer_id,loan_id,num_of_loan_type
0,1006,2,1
1,1006,10,1
2,1007,4,1
3,1007,7,1
4,1007,8,1
...,...,...,...
36653,50992,7,1
36654,50992,10,1
36655,50992,8,1
36656,50996,6,1


In [32]:
# Append the customer/loan-type rows to the `customer_loan` table.
customer_loan_df.to_sql(
    name='customer_loan',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 14: customer annual credit

In [33]:
# Map each credit_mix label to the surrogate id assigned in Credit_Mix.
credict_mix_dict = dict(zip(Credit_Mix['credit_mix'], Credit_Mix['credit_mix_id']))

# `assign` returns a new DataFrame, so the mapped column is never written onto
# a slice of `df` (which would trigger SettingWithCopyWarning).
# NOTE(review): duplicates are dropped on the (customer_id, credit_mix_id) pair,
# while the target table's primary key is customer_id alone - a customer with
# more than one credit_mix value would violate it on insert.
customer_annual_credit_df = (
    df[['customer_id', 'credit_mix']]
    .assign(credit_mix=lambda frame: frame['credit_mix'].map(credict_mix_dict))
    .rename(columns={'credit_mix': 'credit_mix_id'})
    .drop_duplicates()
)
customer_annual_credit_df

Unnamed: 0,customer_id,credit_mix_id
0,3392,1
8,8625,1
16,11708,1
24,47249,1
32,7387,2
...,...,...
99960,14124,1
99968,3862,1
99976,44897,1
99984,34304,3


In [34]:
# Append the per-customer credit-mix rows to the `customer_annual_credit` table.
customer_annual_credit_df.to_sql(
    name='customer_annual_credit',
    con=engine,
    if_exists='append',
    index=False,
)

#### Table 15: customer bank information

In [35]:
# Map each payment_behaviour label to the surrogate id assigned in
# customer_payment_behaviour_df.
behaviour_dict = dict(zip(customer_payment_behaviour_df['payment_behaviour'],
                          customer_payment_behaviour_df['behaviour_id']))

# `assign` returns a new DataFrame, so the mapped column is never written onto
# a slice of `df` (which would trigger SettingWithCopyWarning).
customer_bank_info_df = (
    df[['id', 'num_bank_accounts', 'num_credit_card',
        'interest_rate', 'num_of_loan', 'payment_behaviour']]
    .assign(payment_behaviour=lambda frame: frame['payment_behaviour'].map(behaviour_dict))
    .rename(columns={'payment_behaviour': 'behaviour_id'})
)
customer_bank_info_df

Unnamed: 0,id,num_bank_accounts,num_credit_card,interest_rate,num_of_loan,behaviour_id
0,5634,3.0,4.0,3.0,4.0,1
1,5635,3.0,4.0,3.0,4.0,2
2,5636,3.0,4.0,3.0,4.0,3
3,5637,3.0,4.0,3.0,4.0,4
4,5638,3.0,4.0,3.0,4.0,5
...,...,...,...,...,...,...
99995,155625,4.0,6.0,7.0,2.0,6
99996,155626,4.0,6.0,7.0,2.0,5
99997,155627,4.0,6.0,7.0,2.0,6
99998,155628,4.0,6.0,7.0,2.0,2


In [36]:
# Append the bank-information rows to the `customer_bank_information` table.
customer_bank_info_df.to_sql(
    name='customer_bank_information',
    con=engine,
    if_exists='append',
    index=False,
)

### Querying the database

In [37]:
# Read the customers table back as a sanity check.
# The result's column names are passed to the DataFrame so the output is
# labelled (customer_id, name, ...) instead of with positional headers 0..5.
result = connection.execute("""
    SELECT * FROM customers
    """)
customers_df = pd.DataFrame(result.fetchall(), columns=list(result.keys()))
customers_df.head(5)

Unnamed: 0,0,1,2,3,4,5
0,3392,Aaron Maashoh,23,821000265.0,Scientist,19114.12
1,8625,Rick Rothackerj,28,4075839.0,Teacher,34847.84
2,11708,Langep,34,486853974.0,Engineer,143162.64
3,47249,Jasond,54,72316145.0,Entrepreneur,30689.89
4,7387,Deepaa,21,615067821.0,Developer,35547.71
