# Semi-supervised learning - klasyfikacja

Zbiór danych: https://www.kaggle.com/datasets/abdullah0a/human-age-prediction-synthetic-dataset

Omawiany zbiór danych posiada zmienną wiek, jedynie dla części obserwacji. Semi-supervised learning skupia się na problemach klasyfikacyjnych, więc przekształcimy zmienną wiek na zmienną 0/1.

In [1]:
import pandas as pd
from sklearn.semi_supervised import SelfTrainingClassifier, LabelPropagation
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# puść ten kod, 
# jeżeli wywołujesz plik  w folderze rozwiąznaia, 
# a ramka danych znajduje się w folderze data
import os 
os.chdir('../')

In [3]:
# Wczytanie danych
df = pd.read_csv('data/df.csv',sep=';')

In [5]:
# Nagłówek
df.head()

Unnamed: 0,Gender,Height (cm),Weight (kg),Blood Pressure (s/d),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),...,Cognitive Function,Mental Health Status,Sleep Patterns,Stress Levels,Pollution Exposure,Sun Exposure,Education Level,Income Level,Age (years),is_retired
0,Male,171.148359,86.185197,151/109,259.465814,29.423017,157.652848,0.132868,0.2,58.786198,...,44.059172,Good,Insomnia,2.797064,5.142344,7.108975,,Medium,89.0,1
1,Male,172.946206,79.641937,134/112,263.630292,26.626847,118.507805,0.629534,0.267312,54.63527,...,45.312298,Good,Normal,9.33993,7.27272,3.918489,Undergraduate,Medium,77.0,1
2,Female,155.945488,49.167058,160/101,207.846206,20.217553,143.58755,0.473487,0.248667,54.564632,...,56.246991,Poor,Insomnia,9.234637,8.500386,5.393408,,Medium,70.0,1
3,Female,169.078298,56.017921,133/94,253.283779,19.59527,137.448581,1.184315,0.513818,79.722963,...,55.196092,Poor,Insomnia,4.693446,7.555511,2.745578,,Low,52.0,0
4,Female,163.758355,73.966304,170/106,236.119899,27.582078,145.328695,0.434562,0.306864,52.479469,...,53.023379,Good,Normal,4.038537,9.429097,3.878435,Undergraduate,High,79.0,1


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 27 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Gender                       6000 non-null   object 
 1   Height (cm)                  6000 non-null   float64
 2   Weight (kg)                  6000 non-null   float64
 3   Blood Pressure (s/d)         6000 non-null   object 
 4   Cholesterol Level (mg/dL)    6000 non-null   float64
 5   BMI                          6000 non-null   float64
 6   Blood Glucose Level (mg/dL)  6000 non-null   float64
 7   Bone Density (g/cm²)         6000 non-null   float64
 8   Vision Sharpness             6000 non-null   float64
 9   Hearing Ability (dB)         6000 non-null   float64
 10  Physical Activity Level      6000 non-null   object 
 11  Smoking Status               6000 non-null   object 
 12  Alcohol Consumption          3598 non-null   object 
 13  Diet              

In [7]:
# Zmienna - czy emeryt
df.is_retired.value_counts()

is_retired
-1    3000
 0    1783
 1    1217
Name: count, dtype: int64

In [8]:
# Braki danych
df.isna().sum()

Gender                            0
Height (cm)                       0
Weight (kg)                       0
Blood Pressure (s/d)              0
Cholesterol Level (mg/dL)         0
BMI                               0
Blood Glucose Level (mg/dL)       0
Bone Density (g/cm²)              0
Vision Sharpness                  0
Hearing Ability (dB)              0
Physical Activity Level           0
Smoking Status                    0
Alcohol Consumption            2402
Diet                              0
Chronic Diseases               2598
Medication Use                 2396
Family History                 2902
Cognitive Function                0
Mental Health Status              0
Sleep Patterns                    0
Stress Levels                     0
Pollution Exposure                0
Sun Exposure                      0
Education Level                1254
Income Level                      0
Age (years)                    3000
is_retired                        0
dtype: int64

In [9]:
# Wytypowanie zmiennych
x_names = df.columns[(df.isna().sum()==0).values]


In [10]:
x_names

Index(['Gender', 'Height (cm)', 'Weight (kg)', 'Blood Pressure (s/d)',
       'Cholesterol Level (mg/dL)', 'BMI', 'Blood Glucose Level (mg/dL)',
       'Bone Density (g/cm²)', 'Vision Sharpness', 'Hearing Ability (dB)',
       'Physical Activity Level', 'Smoking Status', 'Diet',
       'Cognitive Function', 'Mental Health Status', 'Sleep Patterns',
       'Stress Levels', 'Pollution Exposure', 'Sun Exposure', 'Income Level',
       'is_retired'],
      dtype='object')

In [11]:
x_names = x_names[:-1]
x_names

Index(['Gender', 'Height (cm)', 'Weight (kg)', 'Blood Pressure (s/d)',
       'Cholesterol Level (mg/dL)', 'BMI', 'Blood Glucose Level (mg/dL)',
       'Bone Density (g/cm²)', 'Vision Sharpness', 'Hearing Ability (dB)',
       'Physical Activity Level', 'Smoking Status', 'Diet',
       'Cognitive Function', 'Mental Health Status', 'Sleep Patterns',
       'Stress Levels', 'Pollution Exposure', 'Sun Exposure', 'Income Level'],
      dtype='object')

In [15]:
# Nowa ramka danych
df_new = df[x_names].select_dtypes(exclude = 'object')

In [16]:
# Braki danych
df_new.isna().sum()

Height (cm)                    0
Weight (kg)                    0
Cholesterol Level (mg/dL)      0
BMI                            0
Blood Glucose Level (mg/dL)    0
Bone Density (g/cm²)           0
Vision Sharpness               0
Hearing Ability (dB)           0
Cognitive Function             0
Stress Levels                  0
Pollution Exposure             0
Sun Exposure                   0
dtype: int64

In [17]:
# Normalizacja
scaler = pd.DataFrame(data = MinMaxScaler().fit_transform(df_new), columns = df_new.columns)

In [18]:
scaler.head()

Unnamed: 0,Height (cm),Weight (kg),Cholesterol Level (mg/dL),BMI,Blood Glucose Level (mg/dL),Bone Density (g/cm²),Vision Sharpness,Hearing Ability (dB),Cognitive Function,Stress Levels,Pollution Exposure,Sun Exposure
0,0.526794,0.589139,0.606361,0.555407,0.757629,0.158881,0.0,0.62536,0.17973,0.199717,0.514022,0.592715
1,0.558346,0.517283,0.629182,0.466015,0.419791,0.382643,0.078039,0.581203,0.196198,0.927034,0.727236,0.326629
2,0.259989,0.182618,0.323497,0.261114,0.63624,0.31234,0.056424,0.580451,0.33989,0.91533,0.850105,0.449637
3,0.490465,0.257852,0.572485,0.241221,0.583258,0.632588,0.363831,0.848082,0.326081,0.410523,0.755539,0.228809
4,0.397102,0.454955,0.478431,0.496553,0.651267,0.294803,0.123895,0.55827,0.297529,0.337722,0.943053,0.323289


In [32]:
# train / test x
test_x = df_new.loc[df['is_retired']!=-1, :].sample(frac=0.3)
train_x = df_new.loc[~(df_new.index.isin(test_x.index)),:]

In [33]:
# train / test y
train_y = df.loc[train_x.index,'is_retired']
test_y = df.loc[test_x.index,'is_retired']

In [22]:
# Model klasyfikacyjny
base = DecisionTreeClassifier(max_depth=20,min_samples_leaf=5)

In [39]:
# Self training model
m1 = SelfTrainingClassifier(base_estimator=base, threshold=0.8).fit(train_x, train_y)

In [40]:
# Finalne wartości y do modelowania
pd.Series(m1.transduction_).value_counts()

 0    2994
 1    1987
-1     119
Name: count, dtype: int64

In [41]:
train_y.value_counts()

is_retired
-1    3000
 0    1254
 1     846
Name: count, dtype: int64

In [43]:
# Liczba iteracji potrzebna do wyznaczenia klasy rekordom bez wartości y 
pd.Series(m1.labeled_iter_).value_counts()

 1    2710
 0    2100
-1     119
 2     115
 3      32
 4      19
 5       5
Name: count, dtype: int64

In [45]:
# Dodanie uzupełnionego y do danych
labels = pd.DataFrame(train_y, columns = ['is_retired'])
labels['filled_label'] = m1.transduction_

In [47]:
# dodanie mianownika
labels['nominator'] = 1

In [48]:
# Tabela przestawna
labels.pivot_table(index= 'is_retired', columns='filled_label', values='nominator', aggfunc='count')

filled_label,-1,0,1
is_retired,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1,119.0,1740.0,1141.0
0,,1254.0,
1,,,846.0


In [50]:
test_pred= m1.predict(test_x)

In [52]:
m1.score(test_x,test_y)

0.9255555555555556

## Label Propagation 
Dokumentacja: https://scikit-learn.org/1.5/modules/generated/sklearn.semi_supervised.LabelPropagation.html

In [None]:
# Uzupełnienie danych  - labelpropagation interpretuje -1 jako braki danych


In [53]:
# Model 
m2 = LabelPropagation(kernel='knn',n_neighbors=10).fit(train_x, train_y)

In [54]:
# Dystrybucja y
m2.label_distributions_

array([[0.        , 1.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       ...,
       [0.48836935, 0.51163065],
       [0.85683577, 0.14316423],
       [0.06612841, 0.93387159]])

In [55]:
# przypisana wartość
pd.Series(m2.transduction_).value_counts()

0    3069
1    2031
Name: count, dtype: int64

In [56]:
# Predict
test_pred = m2.predict(test_x)

In [57]:
test_pred

array([1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1,

In [58]:
# Predict proba
m2.predict_proba(test_x)

array([[0.02726737, 0.97273263],
       [0.10402054, 0.89597946],
       [0.1268838 , 0.8731162 ],
       ...,
       [0.03117678, 0.96882322],
       [0.46721853, 0.53278147],
       [0.99810417, 0.00189583]])

In [59]:
# score
m2.score(test_x,test_y)

0.8166666666666667