## Creating the dataset and importing libraries

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Create the ../data directory (if needed) and write a tiny housing CSV into it.
data_dir = os.path.join('..', 'data')
os.makedirs(data_dir, exist_ok=True)
data_file = os.path.join(data_dir, 'my_house.csv')

# Header row followed by four records; the literal "NA" marks a missing entry.
csv_lines = (
    'NumRooms,Alley,Price\n',
    'NA,Pave,127500\n',
    '2,NA,106000\n',
    '4,NA,178100\n',
    'NA,NA,140000\n',
)
with open(data_file, 'w') as f:
    f.writelines(csv_lines)

## Reading the dataset

In [None]:
# Load the CSV written in the previous cell. Reusing `data_file` guarantees the
# read path matches the write path whatever the current working directory is
# (a hard-coded "/data/..." only matches when the notebook runs one level
# below the filesystem root).
dataset = pd.read_csv(data_file)
dataset.head()

Unnamed: 0,NumRooms,Alley,Price
0,,Pave,127500
1,2.0,,106000
2,4.0,,178100
3,,,140000


In [None]:
# Select the "Alley" column as a Series.
dataset["Alley"]

0    Pave
1     NaN
2     NaN
3     NaN
Name: Alley, dtype: object

In [None]:
# Look up the row whose index label is 3.
dataset.loc[3]

NumRooms       NaN
Alley          NaN
Price       140000
Name: 3, dtype: object

In [None]:
# Draw 3 rows at random. No random_state is passed, so the selected rows
# (and their order) can differ from run to run.
dataset.sample(3)

Unnamed: 0,NumRooms,Alley,Price
3,,,140000
2,4.0,,178100
1,2.0,,106000


## Handling missing data

In [None]:
# Split by position: the first two columns (NumRooms, Alley) are the features,
# the third column (Price) is the target.
inputs, outputs = dataset.iloc[:, 0:2], dataset.iloc[:, 2]

# Impute missing numeric entries with their column mean. numeric_only=True
# restricts the mean to numeric columns, so the string-valued Alley column is
# skipped instead of triggering a warning (older pandas) or a TypeError
# (pandas >= 2.0). Alley's missing values are left untouched here.
inputs = inputs.fillna(inputs.mean(numeric_only=True))
inputs

  inputs = inputs.fillna(inputs.mean())


Unnamed: 0,NumRooms,Alley
0,3.0,Pave
1,2.0,
2,4.0,
3,3.0,


In [None]:
# Display the target column (Price) that was split off from `dataset`.
outputs

0    127500
1    106000
2    178100
3    140000
Name: Price, dtype: int64

In [None]:
# Rebuild `inputs` from the first two columns of the raw `dataset`. This
# replaces the mean-imputed frame from the earlier cell, so the missing
# values are present again.
inputs = dataset.iloc[:, 0:2]
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the dataset.
dataset.describe()

Unnamed: 0,NumRooms,Price
count,2.0,4.0
mean,3.0,137900.0
std,1.414214,30255.68817
min,2.0,106000.0
25%,2.5,122125.0
50%,3.0,133750.0
75%,3.5,149525.0
max,4.0,178100.0


In [None]:
# Show the current feature frame before encoding the categorical column.
inputs

Unnamed: 0,NumRooms,Alley
0,,Pave
1,2.0,
2,4.0,
3,,


In [None]:
# One-hot encode the categorical "Alley" column. dummy_na=True adds an explicit
# indicator column for missing values. dtype=int keeps the indicators as 0/1
# integers (newer pandas versions default to bool), so the frame stays purely
# numeric and can be converted to a tensor later.
inputs = pd.get_dummies(inputs, dummy_na=True, dtype=int)
inputs

Unnamed: 0,NumRooms,Alley_Pave,Alley_nan
0,,1,0
1,2.0,0,1
2,4.0,0,1
3,,0,1


## Conversion to the Tensor Format

In [None]:
import torch

# Keep the first two feature columns and fill the remaining missing numeric
# entries with the per-column mean.
inputs = inputs.iloc[:, 0:2].fillna(inputs.mean())

# Convert to tensors. The features are cast to float explicitly so the
# conversion does not depend on every column already sharing one numeric
# dtype (a float/bool mix would otherwise produce an object array that
# torch.tensor cannot consume).
X = torch.tensor(inputs.to_numpy(dtype=float))
y = torch.tensor(outputs.to_numpy())
X, y

(tensor([[3., 1.],
         [2., 0.],
         [4., 0.],
         [3., 0.]], dtype=torch.float64),
 tensor([127500, 106000, 178100, 140000]))

## Exercises

In [21]:
# Load the diabetes dataset and preview its first rows.
# NOTE(review): the path is absolute ("/diabetes.csv" at the filesystem root);
# confirm where the file actually lives before running this elsewhere.
diabetes = pd.read_csv("/diabetes.csv")
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [22]:
# Print column names, non-null counts and dtypes to check for missing data.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               769 non-null    int64  
 1   Glucose                   769 non-null    int64  
 2   BloodPressure             769 non-null    int64  
 3   SkinThickness             769 non-null    int64  
 4   Insulin                   769 non-null    int64  
 5   BMI                       769 non-null    float64
 6   DiabetesPedigreeFunction  769 non-null    float64
 7   Age                       769 non-null    int64  
 8   Outcome                   769 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.2 KB


In [24]:
# Positional split: columns 0-7 are the features, column 8 is the label.
diabetes_in = diabetes.iloc[:, :8]
diabetes_out = diabetes.iloc[:, 8]

# Turn each part into a tensor via its underlying NumPy array.
X_diabetes = torch.tensor(diabetes_in.to_numpy())
y_diabetes = torch.tensor(diabetes_out.to_numpy())
X_diabetes, y_diabetes

(tensor([[  6.0000, 148.0000,  72.0000,  ...,  33.6000,   0.6270,  50.0000],
         [  1.0000,  85.0000,  66.0000,  ...,  26.6000,   0.3510,  31.0000],
         [  8.0000, 183.0000,  64.0000,  ...,  23.3000,   0.6720,  32.0000],
         ...,
         [  1.0000, 126.0000,  60.0000,  ...,  30.1000,   0.3490,  47.0000],
         [  1.0000,  93.0000,  70.0000,  ...,  30.4000,   0.3150,  23.0000],
         [  0.0000, 123.0000,  77.0000,  ...,  36.3000,   0.2520,  55.0000]],
        dtype=torch.float64),
 tensor([1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1,
         1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
         1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
         1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
         0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,
         1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       