In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
from sklearn.linear_model import LogisticRegression, LinearRegression

#1.Loading file

In [2]:
hearts_df = pd.read_csv("heart.csv")

#2. Checking types and missing data



In [3]:
hearts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trtbps    303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalachh  303 non-null    int64  
 8   exng      303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slp       303 non-null    int64  
 11  caa       303 non-null    int64  
 12  thall     303 non-null    int64  
 13  output    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


Data is complete and all columns are numeric.


#3. Fixing data
From column *cp*, which is an encoding form 0 to 3 of different types of chest pain I created 4 binary features for each kind of pain. I do so beacause they all are not corelated. I obtained *cp0*, *cp1*, *cp2*, *cp3*. Analogically I created *restecg0*, *restecg1*, *restecg2* form resting electrocardiographic results (*restecg*).

In [4]:
for i in range(4):
  hearts_df[f"cp{i}"] = (hearts_df["cp"] == i) * 1
for i in range(3):
  hearts_df[f"restecg{i}"] = (hearts_df["restecg"] == i) * 1

#4. Extracting features and target
I droped columns *cp* i *restecg* and created targets from *output* column and features from the rest.

In [5]:
y_heart = hearts_df["output"]
x_heart = hearts_df.drop(columns=["cp", "restecg", "output"])

#5. Analizing data



In [6]:
x_heart.describe()

Unnamed: 0,age,sex,trtbps,chol,fbs,thalachh,exng,oldpeak,slp,caa,thall,cp0,cp1,cp2,cp3,restecg0,restecg1,restecg2
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,131.623762,246.264026,0.148515,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.471947,0.165017,0.287129,0.075908,0.485149,0.50165,0.013201
std,9.082101,0.466011,17.538143,51.830751,0.356198,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.500038,0.371809,0.453171,0.265288,0.500606,0.500824,0.114325
min,29.0,0.0,94.0,126.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,120.0,211.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,55.0,1.0,130.0,240.0,0.0,153.0,0.0,0.8,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,61.0,1.0,140.0,274.5,0.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
max,77.0,1.0,200.0,564.0,1.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


The data was standarized beacause columns are in very different range.

In [7]:
for column in x_heart.columns:
  x_heart[column] = (x_heart[column] - x_heart[column].mean()) / x_heart[column].std()

#6. Spliting for training and testing data.


In [8]:
X_train_heart, X_test_heart, y_train_heart, y_test_heart = train_test_split(x_heart, y_heart, test_size=0.33, random_state=446519)

#7. Classifires
I used radom forest, support vector machine and logistic regression.

In [9]:
rf_h = RandomForestClassifier(n_estimators = 300, max_depth=5, random_state=446519)
rf_h.fit(X_train_heart, y_train_heart)

RandomForestClassifier(max_depth=5, n_estimators=300, random_state=446519)

In [10]:
svc_h = SVC(random_state=446519)
svc_h.fit(X_train_heart, y_train_heart)

SVC(random_state=446519)

In [11]:
lg_h = LogisticRegression(random_state=446519)
lg_h.fit(X_train_heart, y_train_heart)

LogisticRegression(random_state=446519)

#8. Scores

In [12]:
print(f"Random forest score: {rf_h.score(X_test_heart, y_test_heart)}.")
print(f"Support Vector Machine score: {svc_h.score(X_test_heart, y_test_heart)}.")
print(f"Logistic Regression score: {lg_h.score(X_test_heart, y_test_heart)}.")

Random forest score: 0.84.
Support Vector Machine score: 0.82.
Logistic Regression score: 0.83.


Apparently all used methods give simillar results. Just ahead of all methods was Random forest giving 84% of correct classification. 

#9. Loading data

In [13]:
waste_df = pd.read_csv("public_data_waste_fee.csv")

#10. Checking types and missing data

In [14]:
waste_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4341 entries, 0 to 4340
Data columns (total 39 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   region      4341 non-null   object 
 1   province    4341 non-null   object 
 2   name        4335 non-null   object 
 3   tc          4341 non-null   float64
 4   cres        4289 non-null   float64
 5   csor        4274 non-null   float64
 6   istat       4341 non-null   int64  
 7   area        4335 non-null   float64
 8   pop         4341 non-null   int64  
 9   alt         4335 non-null   float64
 10  isle        4335 non-null   float64
 11  sea         4335 non-null   float64
 12  pden        4335 non-null   float64
 13  wden        4335 non-null   float64
 14  urb         4335 non-null   float64
 15  fee         4341 non-null   object 
 16  d_fee       4341 non-null   int64  
 17  sample      4341 non-null   int64  
 18  organic     3829 non-null   float64
 19  paper       4316 non-null  

There are plenty of missing data.

In [27]:
waste_df = waste_df.dropna()
waste_df["region"].count()

2017

After removing all incomplete records 2017 hes left which is enough for analizes because it is much more than number of columns (39).

#11. Fixing data
Column *fee* is a binary information represented by string. To make it useable the column was encoded by 0 and 1 representing *PAYT* and *STANDARD*.
Column *geo* discribes a one of three regions. Three new features were created and each is a binary information if the region was *south*, *center* or *north*. The features are named *geo_south*, *geo_center* nad *geo_north*.

In [16]:
waste_df["fee"] = (waste_df["fee"] == 'PAYT') * 1
waste_df["geo_south"] = (waste_df["geo"] == 1) * 1
waste_df["geo_center"] = (waste_df["geo"] == 2) * 1
waste_df["geo_north"] = (waste_df["geo"] == 3) * 1

#12. Extracting features and target

Columns *region*, *province* and *name* are string data. They refer to localization and family name. Because this information cannot be properly repserented as numbers these columns were removed. 

In [17]:
x_waste = waste_df.drop(columns=["region", "province", "name", "finance", "geo"])
y_waste = waste_df["finance"]

#13. Analizing data


In [18]:
x_waste.describe()

Unnamed: 0,tc,cres,csor,istat,area,pop,alt,isle,sea,pden,...,sor,roads,s_wteregio,s_landfill,gdp,proads,wage,geo_south,geo_center,geo_north
count,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,...,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0,2017.0
mean,146.704328,46.128929,50.44651,40276.702033,37.8129,7778.185424,270.443728,0.001983,0.130887,458.064041,...,71.82295,86.14818,22.547909,17.143479,7.084848,4.48607,9.545002,0.165097,0.13882,0.696083
std,67.302738,33.84036,27.519303,28785.901676,51.003452,11327.158279,259.837548,0.044499,0.337361,750.805151,...,14.313797,137.178136,14.792088,17.037246,0.483713,1.006831,0.242808,0.37136,0.345844,0.460061
min,46.490002,4.27,3.39,3001.0,0.67,80.0,1.0,0.0,0.0,5.531726,...,14.15,1.0,0.0,3.602713,6.035401,0.187463,8.574547,0.0,0.0,0.0
25%,104.779999,24.59,32.68,17042.0,9.62,2288.0,74.0,0.0,0.0,90.470871,...,66.24,25.0,9.898176,4.55143,6.732576,3.881701,9.443672,0.0,0.0,0.0
50%,129.759995,35.52,48.400002,28090.0,21.129999,4486.0,205.0,0.0,0.0,211.998154,...,75.44,50.0,24.467649,11.296785,6.993063,4.571973,9.599592,0.0,0.0,1.0
75%,171.520004,56.009998,62.580002,61049.0,44.93,8642.0,371.0,0.0,0.0,548.026733,...,81.55,97.0,38.501492,23.119114,7.362284,5.179881,9.699822,0.0,0.0,1.0
max,809.169983,295.200012,273.929993,110005.0,554.97998,180817.0,1568.0,1.0,1.0,12122.826172,...,94.02,2625.0,65.122093,92.53186,9.576093,8.664233,10.285608,1.0,1.0,1.0


The data was standarized beacause columns are in very different range.

In [19]:
for column in x_waste.columns:
  x_waste[column] = (x_waste[column] - x_waste[column].mean()) / x_waste[column].std()

#14. Spliting for training and testing data.


In [20]:
X_train_waste, X_test_waste, y_train_waste, y_test_waste = train_test_split(x_waste, y_waste, test_size=0.33, random_state=446519)

#15. Regressors
For regression I used random forest, support vector machine and regression.

In [21]:
rf_w = RandomForestRegressor(n_estimators = 300, max_depth=5, random_state=446519)
rf_w.fit(X_train_waste, y_train_waste)

RandomForestRegressor(max_depth=5, n_estimators=300, random_state=446519)

In [22]:
svr_w = SVR()
svr_w.fit(X_train_waste, y_train_waste)

SVR()

In [23]:
lr_w = LinearRegression()
lr_w.fit(X_train_waste, y_train_waste)

LinearRegression()

#16. Scores
Given scores are **R<sup>2</sup>** which is given by formula:

```
score = (1 - ((y_true - y_pred)** 2).sum() / ((y_true - y_true.mean()) ** 2).sum())
```

 

In [24]:
print(f"Random forest score: {rf_w.score(X_test_waste, y_test_waste)}.")
print(f"Support Vector Machine score: {svr_w.score(X_test_waste, y_test_waste)}.")
print(f"Logistic Regression score: {lr_w.score(X_test_waste, y_test_waste)}.")

Random forest score: 0.997414911605436.
Support Vector Machine score: 0.920199065116099.
Logistic Regression score: 1.0.
