In [101]:
import pandas as pd
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, precision_recall_curve, roc_curve, roc_auc_score
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
from imblearn.over_sampling import SMOTE

In [102]:
df=pd.read_csv('grant_data_imb.csv')
pd.set_option('display.max_columns',None)

In [103]:
df.sample()

Unnamed: 0,Grant.Status,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,RFCD.Code.4,RFCD.Percentage.4,RFCD.Code.5,RFCD.Percentage.5,SEO.Code.1,SEO.Percentage.1,SEO.Code.2,SEO.Percentage.2,SEO.Code.3,SEO.Percentage.3,SEO.Code.4,SEO.Percentage.4,SEO.Code.5,SEO.Percentage.5,Person.ID.1,Role.1,Year.of.Birth.1,Country.of.Birth.1,Home.Language.1,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1
3631,0,18B,20A,G,321216.0,60.0,321206.0,20.0,321204.0,20.0,0.0,0.0,0.0,0.0,730217.0,40.0,730219.0,40.0,730213.0,20.0,0.0,0.0,0.0,0.0,84622.0,CHIEF_INVESTIGATOR,1955.0,Australia,,2973.0,25.0,,>=0 to 5,0.0,5.0,8.0,10.0,2.0,5.0


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4113 entries, 0 to 4112
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Grant.Status                            4113 non-null   int64  
 1   Sponsor.Code                            3856 non-null   object 
 2   Grant.Category.Code                     3856 non-null   object 
 3   Contract.Value.Band...see.note.A        1953 non-null   object 
 4   RFCD.Code.1                             3853 non-null   float64
 5   RFCD.Percentage.1                       3853 non-null   float64
 6   RFCD.Code.2                             3853 non-null   float64
 7   RFCD.Percentage.2                       3853 non-null   float64
 8   RFCD.Code.3                             3853 non-null   float64
 9   RFCD.Percentage.3                       3853 non-null   float64
 10  RFCD.Code.4                             3853 non-null   floa

In [105]:
target = df['Grant.Status']
features = df.drop(columns=['Grant.Status'])

In [106]:
target.value_counts()

Grant.Status
0    3259
1     854
Name: count, dtype: int64

In [107]:
def upsample(features, target, repaet=10):
    features_zeros=features[target==0]
    features_ones=features[target==1]
    target_zeros=target[target==0]
    target_ones=target[target==1]
    if len(target_ones)>len(target_zeros):
        repaet=round(len(target_ones)/len(target_zeros))
        features_upsampled=pd.concat([features_ones]+ [features_zeros]*repaet)
        target_upsampled=pd.concat([target_ones]+[target_zeros]*repaet)
    else:
        repaet=round(len(target_zeros)/len(target_ones))
        features_upsampled=pd.concat([features_zeros]+ [features_ones]*repaet)
        target_upsampled=pd.concat([target_zeros]+[target_ones]*repaet)
    features_upsampled, target_upsampled=shuffle(features_upsampled, target_upsampled, random_state=23)
    return features_upsampled, target_upsampled


In [108]:
features_train_upsampled, target_train_upsampled=upsample(features, target)

In [109]:
numerical_features = features.select_dtypes(include=['float64', 'int64']).columns

# Заполнение пропусков средними значениями и нулями
for feature in numerical_features:
    features[feature + '_mean'] = features[feature].fillna(features[feature].mean())
    features[feature + '_zero'] = features[feature].fillna(0)

In [110]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4113 entries, 0 to 4112
Data columns (total 98 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Sponsor.Code                            3856 non-null   object 
 1   Grant.Category.Code                     3856 non-null   object 
 2   Contract.Value.Band...see.note.A        1953 non-null   object 
 3   RFCD.Code.1                             3853 non-null   float64
 4   RFCD.Percentage.1                       3853 non-null   float64
 5   RFCD.Code.2                             3853 non-null   float64
 6   RFCD.Percentage.2                       3853 non-null   float64
 7   RFCD.Code.3                             3853 non-null   float64
 8   RFCD.Percentage.3                       3853 non-null   float64
 9   RFCD.Code.4                             3853 non-null   float64
 10  RFCD.Percentage.4                       3853 non-null   floa

In [111]:
features['With.PHD.1'].fillna('No', inplace=True)
features

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features['With.PHD.1'].fillna('No', inplace=True)


Unnamed: 0,Sponsor.Code,Grant.Category.Code,Contract.Value.Band...see.note.A,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,RFCD.Code.4,RFCD.Percentage.4,RFCD.Code.5,RFCD.Percentage.5,SEO.Code.1,SEO.Percentage.1,SEO.Code.2,SEO.Percentage.2,SEO.Code.3,SEO.Percentage.3,SEO.Code.4,SEO.Percentage.4,SEO.Code.5,SEO.Percentage.5,Person.ID.1,Role.1,Year.of.Birth.1,Country.of.Birth.1,Home.Language.1,Dept.No..1,Faculty.No..1,With.PHD.1,No..of.Years.in.Uni.at.Time.of.Grant.1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1,RFCD.Code.1_mean,RFCD.Code.1_zero,RFCD.Percentage.1_mean,RFCD.Percentage.1_zero,RFCD.Code.2_mean,RFCD.Code.2_zero,RFCD.Percentage.2_mean,RFCD.Percentage.2_zero,RFCD.Code.3_mean,RFCD.Code.3_zero,RFCD.Percentage.3_mean,RFCD.Percentage.3_zero,RFCD.Code.4_mean,RFCD.Code.4_zero,RFCD.Percentage.4_mean,RFCD.Percentage.4_zero,RFCD.Code.5_mean,RFCD.Code.5_zero,RFCD.Percentage.5_mean,RFCD.Percentage.5_zero,SEO.Code.1_mean,SEO.Code.1_zero,SEO.Percentage.1_mean,SEO.Percentage.1_zero,SEO.Code.2_mean,SEO.Code.2_zero,SEO.Percentage.2_mean,SEO.Percentage.2_zero,SEO.Code.3_mean,SEO.Code.3_zero,SEO.Percentage.3_mean,SEO.Percentage.3_zero,SEO.Code.4_mean,SEO.Code.4_zero,SEO.Percentage.4_mean,SEO.Percentage.4_zero,SEO.Code.5_mean,SEO.Code.5_zero,SEO.Percentage.5_mean,SEO.Percentage.5_zero,Person.ID.1_mean,Person.ID.1_zero,Year.of.Birth.1_mean,Year.of.Birth.1_zero,Dept.No..1_mean,Dept.No..1_zero,Faculty.No..1_mean,Faculty.No..1_zero,Number.of.Successful.Grant.1_mean,Number.of.Successful.Grant.1_zero,Number.of.Unsuccessful.Grant.1_mean,Number.of.Unsuccessful.Grant.1_zero,A..1_mean,A..1_zero,A.1_mean,A.1_zero,B.1_mean,B.1_zero,C.1_mean,C.1_zero
0,97A,30B,A,321024.0,50.0,321013.0,30.0,291502.0,20.0,0.0,0.0,0.0,0.0,730303.0,50.0,730104.0,30.0,671402.0,20.0,0.0,0.0,0.0,0.0,67037.0,CHIEF_INVESTIGATOR,1950.0,Eastern Europe,,2563.0,25.0,Yes,>10 to 15,2.0,6.0,3.0,5.0,15.0,3.0,321024.000000,321024.0,50.00000,50.0,321013.000000,321013.0,30.000000,30.0,291502.000000,291502.0,20.000000,20.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,730303.00000,730303.0,50.000000,50.0,730104.00000,730104.0,30.000000,30.0,671402.000000,671402.0,20.000000,20.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,67037.0,67037.0,1950.0,1950.0,2563.0,2563.0,25.0,25.0,2.0,2.0,6.0,6.0,3.0,3.0,5.0,5.0,15.0,15.0,3.0,3.0
1,36D,10A,G,300201.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,620108.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12917.0,PRINCIPAL_SUPERVISOR,1955.0,Australia,,1038.0,1.0,No,Less than 0,0.0,3.0,0.0,4.0,0.0,0.0,300201.000000,300201.0,100.00000,100.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,620108.00000,620108.0,100.000000,100.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,12917.0,12917.0,1955.0,1955.0,1038.0,1038.0,1.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,0.0
2,317A,30D,,321013.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730104.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,28607.0,CHIEF_INVESTIGATOR,1965.0,Great Britain,,2763.0,25.0,Yes,>5 to 10,4.0,3.0,6.0,25.0,14.0,14.0,321013.000000,321013.0,100.00000,100.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,730104.00000,730104.0,100.000000,100.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,28607.0,28607.0,1965.0,1965.0,2763.0,2763.0,25.0,25.0,4.0,4.0,3.0,3.0,6.0,6.0,25.0,25.0,14.0,14.0,14.0,14.0
3,62B,10B,B,321103.0,30.0,321105.0,40.0,321204.0,30.0,0.0,0.0,0.0,0.0,730211.0,50.0,730302.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,144632.0,CHIEF_INVESTIGATOR,1960.0,Australia,,2848.0,25.0,No,Less than 0,1.0,2.0,1.0,0.0,0.0,0.0,321103.000000,321103.0,30.00000,30.0,321105.000000,321105.0,40.000000,40.0,321204.000000,321204.0,30.000000,30.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,730211.00000,730211.0,50.000000,50.0,730302.00000,730302.0,50.000000,50.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,144632.0,144632.0,1960.0,1960.0,2848.0,2848.0,25.0,25.0,1.0,1.0,2.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1A,10A,,270603.0,60.0,321205.0,30.0,320603.0,10.0,0.0,0.0,0.0,0.0,730114.0,60.0,730204.0,10.0,730215.0,30.0,0.0,0.0,0.0,0.0,1797.0,CHIEF_INVESTIGATOR,1960.0,North America,,2678.0,25.0,Yes,>5 to 10,5.0,14.0,0.0,9.0,7.0,0.0,270603.000000,270603.0,60.00000,60.0,321205.000000,321205.0,30.000000,30.0,320603.000000,320603.0,10.000000,10.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,730114.00000,730114.0,60.000000,60.0,730204.00000,730204.0,10.000000,10.0,730215.000000,730215.0,30.000000,30.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,1797.0,1797.0,1960.0,1960.0,2678.0,2678.0,25.0,25.0,5.0,5.0,14.0,14.0,0.0,0.0,9.0,9.0,7.0,7.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4108,103C,30B,,321022.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730305.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,747267.0,CHIEF_INVESTIGATOR,1965.0,,,2773.0,25.0,No,>=0 to 5,0.0,1.0,0.0,0.0,0.0,0.0,321022.000000,321022.0,100.00000,100.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,730305.00000,730305.0,100.000000,100.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,747267.0,747267.0,1965.0,1965.0,2773.0,2773.0,25.0,25.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4109,2B,10A,,340208.0,50.0,340499.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,720102.0,50.0,720199.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,64232.0,CHIEF_INVESTIGATOR,1950.0,Asia Pacific,,1678.0,13.0,Yes,more than 15,0.0,1.0,1.0,5.0,8.0,0.0,340208.000000,340208.0,50.00000,50.0,340499.000000,340499.0,50.000000,50.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,720102.00000,720102.0,50.000000,50.0,720199.00000,720199.0,50.000000,50.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,64232.0,64232.0,1950.0,1950.0,1678.0,1678.0,13.0,13.0,0.0,0.0,1.0,1.0,1.0,1.0,5.0,5.0,8.0,8.0,0.0,0.0
4110,2B,10A,,320702.0,40.0,270102.0,30.0,320305.0,30.0,0.0,0.0,0.0,0.0,730203.0,60.0,730104.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,76432.0,CHIEF_INVESTIGATOR,1960.0,Australia,English,2653.0,25.0,No,>=0 to 5,1.0,0.0,26.0,15.0,9.0,6.0,320702.000000,320702.0,40.00000,40.0,270102.000000,270102.0,30.000000,30.0,320305.000000,320305.0,30.000000,30.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,730203.00000,730203.0,60.000000,60.0,730104.00000,730104.0,40.000000,40.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,76432.0,76432.0,1960.0,1960.0,2653.0,2653.0,25.0,25.0,1.0,1.0,0.0,0.0,26.0,26.0,15.0,15.0,9.0,9.0,6.0,6.0
4111,40D,10B,C,,,,,,,,,,,,,,,,,,,,,19072.0,CHIEF_INVESTIGATOR,1975.0,Australia,,3248.0,31.0,No,>=0 to 5,0.0,1.0,1.0,0.0,0.0,0.0,314904.682845,0.0,74.69686,0.0,161386.717104,0.0,17.642616,0.0,96437.197508,0.0,7.089541,0.0,6835.177005,0.0,0.442512,0.0,1767.989878,0.0,0.128471,0.0,729027.71436,0.0,70.557024,0.0,443260.50847,0.0,21.209617,0.0,233810.547824,0.0,7.294866,0.0,32557.909044,0.0,0.719312,0.0,9602.635132,0.0,0.218921,0.0,19072.0,19072.0,1975.0,1975.0,3248.0,3248.0,31.0,31.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
for col in features.columns:
    if (features[col].dtype=='object'):
        features[col].fillna('not indicated', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  features[col].fillna('not indicated', inplace=True)


In [113]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4113 entries, 0 to 4112
Data columns (total 98 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Sponsor.Code                            4113 non-null   object 
 1   Grant.Category.Code                     4113 non-null   object 
 2   Contract.Value.Band...see.note.A        4113 non-null   object 
 3   RFCD.Code.1                             3853 non-null   float64
 4   RFCD.Percentage.1                       3853 non-null   float64
 5   RFCD.Code.2                             3853 non-null   float64
 6   RFCD.Percentage.2                       3853 non-null   float64
 7   RFCD.Code.3                             3853 non-null   float64
 8   RFCD.Percentage.3                       3853 non-null   float64
 9   RFCD.Code.4                             3853 non-null   float64
 10  RFCD.Percentage.4                       3853 non-null   floa

In [114]:
features_one=pd.get_dummies(features, drop_first=True)
features_one.sample()

Unnamed: 0,RFCD.Code.1,RFCD.Percentage.1,RFCD.Code.2,RFCD.Percentage.2,RFCD.Code.3,RFCD.Percentage.3,RFCD.Code.4,RFCD.Percentage.4,RFCD.Code.5,RFCD.Percentage.5,SEO.Code.1,SEO.Percentage.1,SEO.Code.2,SEO.Percentage.2,SEO.Code.3,SEO.Percentage.3,SEO.Code.4,SEO.Percentage.4,SEO.Code.5,SEO.Percentage.5,Person.ID.1,Year.of.Birth.1,Dept.No..1,Faculty.No..1,Number.of.Successful.Grant.1,Number.of.Unsuccessful.Grant.1,A..1,A.1,B.1,C.1,RFCD.Code.1_mean,RFCD.Code.1_zero,RFCD.Percentage.1_mean,RFCD.Percentage.1_zero,RFCD.Code.2_mean,RFCD.Code.2_zero,RFCD.Percentage.2_mean,RFCD.Percentage.2_zero,RFCD.Code.3_mean,RFCD.Code.3_zero,RFCD.Percentage.3_mean,RFCD.Percentage.3_zero,RFCD.Code.4_mean,RFCD.Code.4_zero,RFCD.Percentage.4_mean,RFCD.Percentage.4_zero,RFCD.Code.5_mean,RFCD.Code.5_zero,RFCD.Percentage.5_mean,RFCD.Percentage.5_zero,SEO.Code.1_mean,SEO.Code.1_zero,SEO.Percentage.1_mean,SEO.Percentage.1_zero,SEO.Code.2_mean,SEO.Code.2_zero,SEO.Percentage.2_mean,SEO.Percentage.2_zero,SEO.Code.3_mean,SEO.Code.3_zero,SEO.Percentage.3_mean,SEO.Percentage.3_zero,SEO.Code.4_mean,SEO.Code.4_zero,SEO.Percentage.4_mean,SEO.Percentage.4_zero,SEO.Code.5_mean,SEO.Code.5_zero,SEO.Percentage.5_mean,SEO.Percentage.5_zero,Person.ID.1_mean,Person.ID.1_zero,Year.of.Birth.1_mean,Year.of.Birth.1_zero,Dept.No..1_mean,Dept.No..1_zero,Faculty.No..1_mean,Faculty.No..1_zero,Number.of.Successful.Grant.1_mean,Number.of.Successful.Grant.1_zero,Number.of.Unsuccessful.Grant.1_mean,Number.of.Unsuccessful.Grant.1_zero,A..1_mean,A..1_zero,A.1_mean,A.1_zero,B.1_mean,B.1_zero,C.1_mean,C.1_zero,Sponsor.Code_101A,Sponsor.Code_103C,Sponsor.Code_105A,Sponsor.Code_107C,Sponsor.Code_111C,Sponsor.Code_112D,Sponsor.Code_113A,Sponsor.Code_11C,Sponsor.Code_120D,Sponsor.Code_126B,Sponsor.Code_128D,Sponsor.Code_12D,Sponsor.Code_130B,Sponsor.Code_132D,Sponsor.Code_133A,Sponsor.Code_135C,Sponsor.Code_136D,Sponsor.Code_137A,Sponsor.Code_138B,Sponsor.Code_139C,Sponsor.Code_13A,Sponsor.Code_141A,Sponsor.Code_143C,Sponsor.Code_144D,Sponsor.Code_145A,Sponsor.Code_146B,Sponsor.Code_147C,Sponsor.Code_148D,Sponsor.Code_149A,Sponsor.Code_14B,Sponsor.Code_150B,Sponsor.Code_151C,Sponsor.Code_153A,Sponsor.Code_154B,Sponsor.Code_156D,Sponsor.Code_157A,Sponsor.Code_158B,Sponsor.Code_159C,Sponsor.Code_15C,Sponsor.Code_160D,Sponsor.Code_161A,Sponsor.Code_163C,Sponsor.Code_164D,Sponsor.Code_165A,Sponsor.Code_166B,Sponsor.Code_168D,Sponsor.Code_169A,Sponsor.Code_16D,Sponsor.Code_170B,Sponsor.Code_172D,Sponsor.Code_173A,Sponsor.Code_174B,Sponsor.Code_176D,Sponsor.Code_177A,Sponsor.Code_178B,Sponsor.Code_179C,Sponsor.Code_180D,Sponsor.Code_183C,Sponsor.Code_184D,Sponsor.Code_187C,Sponsor.Code_188D,Sponsor.Code_18B,Sponsor.Code_191C,Sponsor.Code_193A,Sponsor.Code_194B,Sponsor.Code_195C,Sponsor.Code_196D,Sponsor.Code_197A,Sponsor.Code_198B,Sponsor.Code_199C,Sponsor.Code_1A,Sponsor.Code_200D,Sponsor.Code_201A,Sponsor.Code_202B,Sponsor.Code_203C,Sponsor.Code_204D,Sponsor.Code_205A,Sponsor.Code_206B,Sponsor.Code_208D,Sponsor.Code_20D,Sponsor.Code_210B,Sponsor.Code_212D,Sponsor.Code_214B,Sponsor.Code_215C,Sponsor.Code_216D,Sponsor.Code_219C,Sponsor.Code_21A,Sponsor.Code_221A,Sponsor.Code_222B,Sponsor.Code_223C,Sponsor.Code_225A,Sponsor.Code_226B,Sponsor.Code_227C,Sponsor.Code_228D,Sponsor.Code_229A,Sponsor.Code_230B,Sponsor.Code_232D,Sponsor.Code_234B,Sponsor.Code_235C,Sponsor.Code_238B,Sponsor.Code_23C,Sponsor.Code_241A,Sponsor.Code_242B,Sponsor.Code_244D,Sponsor.Code_245A,Sponsor.Code_247C,Sponsor.Code_24D,Sponsor.Code_250B,Sponsor.Code_252D,Sponsor.Code_255C,Sponsor.Code_256D,Sponsor.Code_257A,Sponsor.Code_258B,Sponsor.Code_259C,Sponsor.Code_260D,Sponsor.Code_262B,Sponsor.Code_265A,Sponsor.Code_266B,Sponsor.Code_267C,Sponsor.Code_269A,Sponsor.Code_26B,Sponsor.Code_270B,Sponsor.Code_273A,Sponsor.Code_274B,Sponsor.Code_277A,Sponsor.Code_279C,Sponsor.Code_27C,Sponsor.Code_281A,Sponsor.Code_282B,Sponsor.Code_284D,Sponsor.Code_285A,Sponsor.Code_286B,Sponsor.Code_289A,Sponsor.Code_28D,Sponsor.Code_294B,Sponsor.Code_295C,Sponsor.Code_298B,Sponsor.Code_299C,Sponsor.Code_29A,Sponsor.Code_2B,Sponsor.Code_305A,Sponsor.Code_307C,Sponsor.Code_308D,Sponsor.Code_309A,Sponsor.Code_310B,Sponsor.Code_311C,Sponsor.Code_313A,Sponsor.Code_315C,Sponsor.Code_317A,Sponsor.Code_318B,Sponsor.Code_324D,Sponsor.Code_325A,Sponsor.Code_326B,Sponsor.Code_328D,Sponsor.Code_32D,Sponsor.Code_330B,Sponsor.Code_331C,Sponsor.Code_33A,Sponsor.Code_342B,Sponsor.Code_346B,Sponsor.Code_347C,Sponsor.Code_349A,Sponsor.Code_34B,Sponsor.Code_357A,Sponsor.Code_362B,Sponsor.Code_36D,Sponsor.Code_370B,Sponsor.Code_373A,Sponsor.Code_37A,Sponsor.Code_38B,Sponsor.Code_39C,Sponsor.Code_3C,Sponsor.Code_40D,Sponsor.Code_415C,Sponsor.Code_427C,Sponsor.Code_429A,Sponsor.Code_42B,Sponsor.Code_433A,Sponsor.Code_434B,Sponsor.Code_435C,Sponsor.Code_437A,Sponsor.Code_47C,Sponsor.Code_48D,Sponsor.Code_49A,Sponsor.Code_4D,Sponsor.Code_51C,Sponsor.Code_52D,Sponsor.Code_53A,Sponsor.Code_54B,Sponsor.Code_55C,Sponsor.Code_56D,Sponsor.Code_58B,Sponsor.Code_59C,Sponsor.Code_5A,Sponsor.Code_60D,Sponsor.Code_62B,Sponsor.Code_63C,Sponsor.Code_65A,Sponsor.Code_66B,Sponsor.Code_67C,Sponsor.Code_68D,Sponsor.Code_69A,Sponsor.Code_6B,Sponsor.Code_72D,Sponsor.Code_73A,Sponsor.Code_74B,Sponsor.Code_75C,Sponsor.Code_77A,Sponsor.Code_78B,Sponsor.Code_7C,Sponsor.Code_80D,Sponsor.Code_82B,Sponsor.Code_83C,Sponsor.Code_84D,Sponsor.Code_85A,Sponsor.Code_86B,Sponsor.Code_87C,Sponsor.Code_89A,Sponsor.Code_90B,Sponsor.Code_91C,Sponsor.Code_93A,Sponsor.Code_94B,Sponsor.Code_95C,Sponsor.Code_97A,Sponsor.Code_9A,Sponsor.Code_not indicated,Grant.Category.Code_10B,Grant.Category.Code_20A,Grant.Category.Code_20C,Grant.Category.Code_30A,Grant.Category.Code_30B,Grant.Category.Code_30C,Grant.Category.Code_30D,Grant.Category.Code_30E,Grant.Category.Code_30F,Grant.Category.Code_30G,Grant.Category.Code_40C,Grant.Category.Code_50A,Grant.Category.Code_not indicated,Contract.Value.Band...see.note.A_B,Contract.Value.Band...see.note.A_C,Contract.Value.Band...see.note.A_D,Contract.Value.Band...see.note.A_E,Contract.Value.Band...see.note.A_F,Contract.Value.Band...see.note.A_G,Contract.Value.Band...see.note.A_H,Contract.Value.Band...see.note.A_I,Contract.Value.Band...see.note.A_J,Contract.Value.Band...see.note.A_K,Contract.Value.Band...see.note.A_L,Contract.Value.Band...see.note.A_M,Contract.Value.Band...see.note.A_O,Contract.Value.Band...see.note.A_P,Contract.Value.Band...see.note.A_Q,Contract.Value.Band...see.note.A_not indicated,Role.1_DELEGATED_RESEARCHER,Role.1_EXTERNAL_ADVISOR,Role.1_EXT_CHIEF_INVESTIGATOR,Role.1_HONVISIT,Role.1_PRINCIPAL_SUPERVISOR,Role.1_STUD_CHIEF_INVESTIGATOR,Role.1_not indicated,Country.of.Birth.1_Australia,Country.of.Birth.1_Eastern Europe,Country.of.Birth.1_Great Britain,Country.of.Birth.1_Middle East and Africa,Country.of.Birth.1_New Zealand,Country.of.Birth.1_North America,Country.of.Birth.1_South Africa,Country.of.Birth.1_The Americas,Country.of.Birth.1_Western Europe,Country.of.Birth.1_not indicated,Home.Language.1_Other,Home.Language.1_not indicated,With.PHD.1_Yes,No..of.Years.in.Uni.at.Time.of.Grant.1_>5 to 10,No..of.Years.in.Uni.at.Time.of.Grant.1_>=0 to 5,No..of.Years.in.Uni.at.Time.of.Grant.1_Less than 0,No..of.Years.in.Uni.at.Time.of.Grant.1_more than 15,No..of.Years.in.Uni.at.Time.of.Grant.1_not indicated
3041,321204.0,50.0,321299.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,730211.0,50.0,730299.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,65712.0,1950.0,2523.0,25.0,1.0,6.0,1.0,3.0,6.0,9.0,321204.0,321204.0,50.0,50.0,321299.0,321299.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,730211.0,730211.0,50.0,50.0,730299.0,730299.0,50.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65712.0,65712.0,1950.0,1950.0,2523.0,2523.0,25.0,25.0,1.0,1.0,6.0,6.0,1.0,1.0,3.0,3.0,6.0,6.0,9.0,9.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,True,False


In [115]:
features_train, features_test, target_train, target_test = train_test_split(features_one, target, test_size=0.2, random_state=42)


In [116]:
scaler = StandardScaler()
features_train_scaled = scaler.fit_transform(features_train)
features_test_scaled = scaler.transform(features_test)

In [117]:
pd.DataFrame(features_test_scaled,columns=features_train.columns).dropna(inplace=True)

In [118]:
model=LogisticRegressionCV(solver='liblinear',random_state=12,class_weight='balanced',cv=10)
model.fit(features_train_scaled, target_train)
roc_auc_score(target_test, model.predict_proba(features_test_scaled))

ValueError: Input X contains NaN.
LogisticRegressionCV does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values