In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Build a synthetic sample in which the price depends only on length * width.
# Seed the global NumPy generator so every re-run draws the same sample.
np.random.seed(42)

n_samples = 1000

# np.random.choice(k, n) draws n integers from 0..k-1; the offset shifts the range.
age_owner = np.random.choice(90, n_samples) + 21   # 21..110, never used in the price
length = np.random.choice(120, n_samples) + 15     # 15..134
width = np.random.choice(80, n_samples) + 10       # 10..89

# The target is an exact function of the area: no noise and no age term.
price = length * width * 100 + 126

data = pd.DataFrame({'age_owner': age_owner, 'length': length, 'width': width, 'price': price})
data.head(5)

Unnamed: 0,age_owner,length,width,price
0,100,104,78,811326
1,48,94,75,705126
2,70,118,27,318726
3,50,62,57,353526
4,35,61,70,427126


In [3]:
# Baseline: fit on every raw feature, including the owner age that the price ignores.
feature_columns = ['age_owner', 'length', 'width']
X = data[feature_columns]
y = data['price']

reg = LinearRegression().fit(X, y)
print(f'Weights: {reg.coef_}')
print(f'Bias: {reg.intercept_}')

# The error is measured on the same rows the model was fitted on (no hold-out set here).
pred_values = reg.predict(X)
print(f'Error: {mean_absolute_error(y, pred_values)}')

Weights: [  50.47292293 5033.8565679  7515.17441307]
Bias: -380968.4825958383
Error: 60649.41560482876


In [4]:
# Median of the target column, for a sense of scale next to the error printed above.
y.median()

323326.0

In [5]:
# Same fit as before, but with the owner-age column left out.
size_columns = ['length', 'width']
X, y = data[size_columns], data['price']

reg = LinearRegression()
reg.fit(X, y)
print(f'Weights: {reg.coef_}')
print(f'Bias: {reg.intercept_}')

pred_values = reg.predict(data[size_columns])
print(f'Error: {mean_absolute_error(pred_values, y)}')

Weights: [5034.39620787 7513.9469404 ]
Bias: -377645.2773167181
Error: 60672.66958369974


In [6]:
# Create a new feature: the product of length and width.
data['mult'] = data[['length', 'width']].prod(axis=1)
data.head(5)

Unnamed: 0,age_owner,length,width,price,mult
0,100,104,78,811326,8112
1,48,94,75,705126,7050
2,70,118,27,318726,3186
3,50,62,57,353526,3534
4,35,61,70,427126,4270


In [7]:
# Refit using only the engineered 'mult' feature.
area_columns = ['mult']
X, y = data[area_columns], data['price']

reg = LinearRegression()
reg.fit(X, y)
print(f'Weights: {reg.coef_}')
print(f'Bias: {reg.intercept_}')

pred_values = reg.predict(data[area_columns])
print(f'Error: {mean_absolute_error(pred_values, y)}')

Weights: [100.]
Bias: 126.00000000034925
Error: 2.1549203665927053e-10


In [8]:
# ---------------------------------------------------------------------------------
#
# Homework
#
#

In [9]:
#
# First, define a helper that fits a linear model and prints its weights and error
#

def create_model(X_train, y_train, X_test, y_test, test_data, target_columns, verbose=False):
    """Fit a linear regression, print its weights and test error, optionally return predictions.

    Args:
        X_train: feature columns of the training rows.
        y_train: target values of the training rows.
        X_test: the same feature columns for the held-out rows.
        y_test: target values of the held-out rows.
        test_data: the full held-out frame; only read when ``verbose`` is true,
            and must then contain 'age', 'temperature' and 'has_covid'.
        target_columns: names of the feature columns; used only in the printed header.
        verbose: when true, return a frame with the predictions attached.

    Returns:
        None when ``verbose`` is false; otherwise a new DataFrame with the
        'age', 'temperature' and 'has_covid' columns of ``test_data`` plus a
        'preds' column holding the model output.
    """
    model = LinearRegression().fit(X_train, y_train)
    print(f'Used columns: {",".join(target_columns)}')
    print(f'     Weights: {model.coef_}')
    pred_values = model.predict(X_test)
    # Mean absolute error, printed multiplied by 100 with a '%' sign.
    # Arguments follow the documented (y_true, y_pred) order.
    error = mean_absolute_error(y_test, pred_values)
    print(f'     Error|Bias: {round(error * 100, 3)}% | {model.intercept_}\n')

    if not verbose:
        return None

    # Copy before adding a column: assigning into a column selection of
    # test_data triggers pandas' SettingWithCopyWarning.
    result = test_data[['age', 'temperature', 'has_covid']].copy()
    result['preds'] = pred_values

    return result

In [10]:
#
# Generate a synthetic data set for an imaginary hospital :)
#

# Seed the global NumPy generator so every re-run produces the same patients.
np.random.seed(42)

nm_samples = 1000

age = np.random.choice(70, nm_samples) + 18        # 18..87
height = np.random.choice(100, nm_samples) + 100   # 100..199
weight = np.random.choice(100, nm_samples) + 40    # 40..139
sex = np.random.choice(['male', 'female'], nm_samples)
# Street addresses such as 'Lenin street, 6 - 12' are treated as already grouped into a city region.
city_region = np.random.choice(['north', 'south', 'west', 'east', 'center'], nm_samples)
healer_name = np.random.choice(['Ivanov', 'Yakovleva', 'Sidorov', 'Petrova'], nm_samples)
disabilities = np.random.choice(['none', 'no hand', 'no eye', 'no leg'], nm_samples, p=[0.85, 0.05, 0.05, 0.05])

# Our target.
has_covid = np.random.choice(['positive', 'negative'], nm_samples)
is_positive = has_covid == 'positive'

# Temperature follows the target: 37.5..40.9 for positive patients and
# 36.5..37.6 for negative ones (the two ranges overlap slightly).
# Both candidate arrays are drawn in one vectorized call each; np.where picks per patient.
temperature = np.where(
    is_positive,
    37.5 + np.random.choice(35, nm_samples) / 10,
    36.5 + np.random.choice(12, nm_samples) / 10,
)

# 80% of the positive patients have a breathing problem; negative patients never do.
breath_problem = np.where(
    is_positive,
    np.random.choice(['positive', 'negative'], nm_samples, p=[0.8, 0.2]),
    'negative',
)

# Noise columns that are drawn independently of the target.
children = np.random.choice(4, nm_samples, p=[0.79, 0.1, 0.1, 0.01])
country = np.random.choice(['Russia', 'USA', 'Europe', 'Asia', 'Africa'], nm_samples)
work = np.random.choice(['LLC', 'Gov', 'None'], nm_samples)


In [11]:
# Assemble the generated columns into one raw frame (categorical values are still strings).
orig_data = pd.DataFrame({'age': age, 'height': height, 'weight': weight,
                          'temperature': temperature, 'breath_problem': breath_problem,
                          'children': children, 'country': country, 'work': work,
                          'sex': sex, 'city_region': city_region, 'healer_name': healer_name, 'disabilities': disabilities,
                          'has_covid': has_covid})
orig_data.info()

# Work on an independent copy: `orig_data[:]` is only a slice of the same frame,
# so column assignments on it trigger pandas' SettingWithCopyWarning.
data = orig_data.copy()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1000 non-null   int64  
 1   height          1000 non-null   int64  
 2   weight          1000 non-null   int64  
 3   temperature     1000 non-null   float64
 4   breath_problem  1000 non-null   object 
 5   children        1000 non-null   int64  
 6   country         1000 non-null   object 
 7   work            1000 non-null   object 
 8   sex             1000 non-null   object 
 9   city_region     1000 non-null   object 
 10  healer_name     1000 non-null   object 
 11  disabilities    1000 non-null   object 
 12  has_covid       1000 non-null   object 
dtypes: float64(1), int64(4), object(8)
memory usage: 101.7+ KB


In [12]:
# Peek at the first rows of the raw frame before any encoding.
orig_data.head()

Unnamed: 0,age,height,weight,temperature,breath_problem,children,country,work,sex,city_region,healer_name,disabilities,has_covid
0,62,181,70,36.5,negative,1,Asia,Gov,male,center,Yakovleva,none,negative
1,77,102,72,40.9,negative,0,Europe,Gov,male,east,Yakovleva,none,positive
2,71,173,59,37.5,negative,0,USA,,female,south,Yakovleva,none,negative
3,33,141,45,37.4,negative,0,Asia,LLC,female,west,Ivanov,none,negative
4,20,141,83,39.5,positive,0,Asia,Gov,male,east,Petrova,none,positive


In [13]:
# Convert the string-valued columns to numbers so they can feed a linear model.

# Columns replaced by integer codes.
label_columns = ['breath_problem', 'sex', 'has_covid', 'disabilities', 'country', 'work']
# Columns expanded into one indicator column per category.
one_hot_columns = ['healer_name', 'city_region']

# One fitted encoder per column. A single shared LabelEncoder only remembers the
# last column it was fitted on, so inverse_transform could not recover the others.
label_encoders = {column: LabelEncoder().fit(orig_data[column]) for column in label_columns}
le = label_encoders[label_columns[-1]]  # same final state as before: fitted on 'work'

# NOTE(review): `ohe` / `ft` are not used by any cell visible here (pd.get_dummies
# below builds the indicator columns); kept in case a later cell relies on them.
ohe = OneHotEncoder()
ft = ohe.fit_transform(orig_data[one_hot_columns])

# Build the encoded frame from orig_data in one go, so re-running this cell
# always gives the same result instead of stacking changes onto `data`.
encoded = orig_data.drop(columns=one_hot_columns).assign(
    **{column: encoder.transform(orig_data[column]) for column, encoder in label_encoders.items()}
)
ohe_data = pd.get_dummies(orig_data[one_hot_columns])
data = pd.concat([encoded, ohe_data], axis=1)

# To get the original labels back for any encoded column:
#   label_encoders['sex'].inverse_transform(data['sex'])

data.head()

Unnamed: 0,age,height,weight,temperature,breath_problem,children,country,work,sex,disabilities,has_covid,healer_name_Ivanov,healer_name_Petrova,healer_name_Sidorov,healer_name_Yakovleva,city_region_center,city_region_east,city_region_north,city_region_south,city_region_west
0,62,181,70,36.5,0,1,1,0,1,3,0,0,0,0,1,1,0,0,0,0
1,77,102,72,40.9,0,0,2,0,1,3,1,0,0,0,1,0,1,0,0,0
2,71,173,59,37.5,0,0,4,2,0,3,0,0,0,0,1,0,0,0,1,0
3,33,141,45,37.4,0,0,1,1,0,3,0,1,0,0,0,0,0,0,0,1
4,20,141,83,39.5,1,0,1,0,1,3,1,0,1,0,0,0,1,0,0,0


In [14]:
# Pairwise plots of the main columns, colored by has_covid (currently commented out)

#sns.pairplot(data[['age', 'height', 'weight', 'temperature', 'sex', 'disabilities', 'has_covid']], 
#             hue='has_covid').add_legend();

In [15]:
# Column names after encoding, for reference when choosing feature sets.
data.columns

Index(['age', 'height', 'weight', 'temperature', 'breath_problem', 'children',
       'country', 'work', 'sex', 'disabilities', 'has_covid',
       'healer_name_Ivanov', 'healer_name_Petrova', 'healer_name_Sidorov',
       'healer_name_Yakovleva', 'city_region_center', 'city_region_east',
       'city_region_north', 'city_region_south', 'city_region_west'],
      dtype='object')

### 1. Test on clean, formatted data (ideal case):

In [16]:
#
#
# The target is to predict covid; some columns, such as healer_name or city_region, matter less for that
#

def make_model_test(prev_data, n_test=10):
    """Compare linear models built on several feature subsets.

    The last ``n_test`` rows of ``prev_data`` are held out; each feature set is
    fitted on the remaining rows and reported through ``create_model``.

    Args:
        prev_data: encoded frame containing 'has_covid' and every feature
            column listed below.
        n_test: number of trailing rows used for evaluation. Defaults to 10,
            the value that used to be hard-coded.

    Raises:
        ValueError: if ``n_test`` does not leave at least one row on each side
            of the split.
    """
    if not 0 < n_test < len(prev_data):
        raise ValueError(
            f'n_test must be between 1 and {len(prev_data) - 1}, got {n_test}'
        )

    target_column = 'has_covid'

    # Positional split: everything except the last n_test rows is training data.
    split_at = len(prev_data) - n_test
    train_data = prev_data.iloc[:split_at]
    test_data = prev_data.iloc[split_at:]

    feature_sets = [
        ['temperature'],
        ['temperature', 'age'],
        ['temperature', 'age', 'weight', 'height', 'disabilities'],
        # the same ideas with 'breath_problem' added
        ['temperature', 'breath_problem'],  # lowest error among the results visible in the saved output
        ['temperature', 'age', 'breath_problem'],
        ['temperature', 'age', 'weight', 'height', 'breath_problem'],
        # every feature column (breath_problem included)
        ['age', 'height', 'weight', 'temperature', 'breath_problem', 'children',
         'country', 'work', 'sex', 'disabilities',
         'healer_name_Ivanov', 'healer_name_Petrova', 'healer_name_Sidorov',
         'healer_name_Yakovleva', 'city_region_center', 'city_region_east',
         'city_region_north', 'city_region_south', 'city_region_west'],
    ]

    for train_columns in feature_sets:
        X_train = train_data[train_columns]
        y_train = train_data[target_column]
        X_test = test_data[train_columns]
        y_test = test_data[target_column]

        create_model(X_train, y_train, X_test, y_test, test_data, train_columns)


make_model_test(data[:])

Used columns: temperature
     Weights: [0.31103599]
     Error|Bias: 23.562% | -11.358980682081494

Used columns: temperature,age
     Weights: [ 3.10956713e-01 -1.75771982e-04]
     Error|Bias: 23.354% | -11.34672551951544

Used columns: temperature,age,weight,height,disabilities
     Weights: [ 3.11020784e-01 -1.84960644e-04  1.44505790e-04  1.36902333e-04
 -5.77584734e-04]
     Error|Bias: 23.551% | -11.380833646552336

Used columns: temperature,breath_problem
     Weights: [0.20389463 0.44586553]
     Error|Bias: 14.834% | -7.442504018332901

Used columns: temperature,age,breath_problem
     Weights: [2.03880936e-01 1.58858514e-04 4.46220667e-01]
     Error|Bias: 15.015% | -7.450460438595111

Used columns: temperature,age,weight,height,breath_problem
     Weights: [2.03943558e-01 1.47422198e-04 1.43376889e-05 1.44331925e-04
 4.46203586e-01]
     Error|Bias: 15.191% | -7.4753831708820275

Used columns: age,height,weight,temperature,breath_problem,children,country,work,sex,disabilit

### 2. Let's corrupt some rows with missing (NA) and unusual values:

In [17]:
# Cells that lose their value (row label, column).
missing_cells = [
    (100, 'age'),
    (200, 'age'),
    (200, 'height'),
    (300, 'weight'),
    (400, 'weight'),
    (200, 'weight'),
]
for row, column in missing_cells:
    data.loc[row, column] = None

# Cells overwritten with unusual values (row label, column, new value).
unusual_cells = [
    (500, 'weight', 998),
    (501, 'weight', 8),
    (600, 'temperature', 96.6),
    (600, 'breath_problem', 0),
    (601, 'temperature', 11.6),
    (700, 'age', 258),
]
for row, column, value in unusual_cells:
    data.loc[row, column] = value

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    998 non-null    float64
 1   height                 999 non-null    float64
 2   weight                 997 non-null    float64
 3   temperature            1000 non-null   float64
 4   breath_problem         1000 non-null   int64  
 5   children               1000 non-null   int64  
 6   country                1000 non-null   int64  
 7   work                   1000 non-null   int64  
 8   sex                    1000 non-null   int64  
 9   disabilities           1000 non-null   int64  
 10  has_covid              1000 non-null   int64  
 11  healer_name_Ivanov     1000 non-null   uint8  
 12  healer_name_Petrova    1000 non-null   uint8  
 13  healer_name_Sidorov    1000 non-null   uint8  
 14  healer_name_Yakovleva  1000 non-null   uint8  
 15  city_

In [18]:
# Remove only the rows with a missing value; the unusual values stay for now.

data = data.dropna(subset=['age', 'height', 'weight'])


### Run the test with the incorrect values still present (they are filtered out later)

In [19]:
# Same comparison, run while the unusual values are still in the frame
# (only the rows with missing values have been dropped so far).
make_model_test(data[:])

Used columns: temperature
     Weights: [0.09311606]
     Error|Bias: 42.085% | -3.0513031358855858

Used columns: temperature,age
     Weights: [ 0.0933518  -0.00085199]
     Error|Bias: 41.026% | -3.0153437795619817

Used columns: temperature,age,weight,height,disabilities
     Weights: [ 9.36252684e-02 -8.17461648e-04  2.91467801e-04  7.07752096e-05
  2.01657115e-02]
     Error|Bias: 40.989% | -3.1192440128558307

Used columns: temperature,breath_problem
     Weights: [0.04736845 0.72619093]
     Error|Bias: 16.981% | -1.5802783013044008

Used columns: temperature,age,breath_problem
     Weights: [4.72390886e-02 2.68388633e-04 7.27065555e-01]
     Error|Bias: 17.229% | -1.5898342672358972

Used columns: temperature,age,weight,height,breath_problem
     Weights: [ 4.72671819e-02  2.61416534e-04 -3.31351484e-05  1.24510940e-04
  7.27216801e-01]
     Error|Bias: 17.372% | -1.6064377750265066

Used columns: age,height,weight,temperature,breath_problem,children,country,work,sex,disabilit

### Let's remove the rows with incorrect data

In [20]:
# Keep only the rows whose age, weight and temperature fall strictly inside the accepted ranges.
within_range = (
    (data['age'] > 10) & (data['age'] < 100)
    & (data['weight'] > 20) & (data['weight'] < 200)
    & (data['temperature'] > 36) & (data['temperature'] < 42)
)
data = data[within_range]

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 991 entries, 0 to 999
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   age                    991 non-null    float64
 1   height                 991 non-null    float64
 2   weight                 991 non-null    float64
 3   temperature            991 non-null    float64
 4   breath_problem         991 non-null    int64  
 5   children               991 non-null    int64  
 6   country                991 non-null    int64  
 7   work                   991 non-null    int64  
 8   sex                    991 non-null    int64  
 9   disabilities           991 non-null    int64  
 10  has_covid              991 non-null    int64  
 11  healer_name_Ivanov     991 non-null    uint8  
 12  healer_name_Petrova    991 non-null    uint8  
 13  healer_name_Sidorov    991 non-null    uint8  
 14  healer_name_Yakovleva  991 non-null    uint8  
 15  city_r

In [21]:
# Run the same comparison again, now on the filtered data.

make_model_test(data[:])

Used columns: temperature
     Weights: [0.31102225]
     Error|Bias: 23.563% | -11.357142445581884

Used columns: temperature,age
     Weights: [ 3.10905471e-01 -2.23718202e-04]
     Error|Bias: 23.3% | -11.34093291640482

Used columns: temperature,age,weight,height,disabilities
     Weights: [ 3.10978449e-01 -2.32330516e-04  1.55017839e-04  1.66001608e-04
  8.07272037e-04]
     Error|Bias: 23.524% | -11.384472379961181

Used columns: temperature,breath_problem
     Weights: [0.20365087 0.44645089]
     Error|Bias: 14.832% | -7.432474268417978

Used columns: temperature,age,breath_problem
     Weights: [2.03650565e-01 1.03777479e-04 4.46677406e-01]
     Error|Bias: 14.949% | -7.43800219953728

Used columns: temperature,age,weight,height,breath_problem
     Weights: [2.03706570e-01 8.73323967e-05 1.06895785e-05 1.85739080e-04
 4.46698299e-01]
     Error|Bias: 15.173% | -7.4683452527535135

Used columns: age,height,weight,temperature,breath_problem,children,country,work,sex,disabilities