In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# 查看数据 

In [2]:
# Load the training set (path is relative to the notebook's working directory)
data = pd.read_csv('../data/train/train.csv')
# Preview the first five rows
data.head(5)

Unnamed: 0,id_num,program_type,program_id,program_duration,test_id,test_type,difficulty_level,trainee_id,gender,education,city_tier,age,total_programs_enrolled,is_handicapped,trainee_engagement_rating,is_pass
0,9389_150,Y,Y_1,136.0,150.0,offline,intermediate,9389.0,M,Matriculation,3.0,24.0,5.0,N,1.0,0
1,16523_44,T,T_1,131.0,44.0,offline,easy,16523.0,F,High School Diploma,4.0,26.0,2.0,N,3.0,1
2,13987_178,Z,Z_2,120.0,178.0,online,easy,13987.0,M,Matriculation,1.0,40.0,1.0,N,2.0,1
3,13158_32,T,T_2,117.0,32.0,offline,easy,13158.0,F,Matriculation,3.0,,4.0,N,1.0,1
4,10591_84,V,V_3,131.0,84.0,offline,intermediate,10591.0,F,High School Diploma,1.0,42.0,2.0,N,4.0,1


预测is_pass，**是二元分类问题**，打算采取逻辑回归算法

In [3]:
data.describe()
# Only statistics for the numeric features are shown here

Unnamed: 0,program_duration,test_id,trainee_id,city_tier,age,total_programs_enrolled,trainee_engagement_rating,is_pass
count,49323.0,49273.0,49259.0,49298.0,30619.0,49306.0,49226.0,49998.0
mean,128.229366,91.414345,9863.493128,2.249097,36.514256,2.583114,2.397818,0.696288
std,6.889967,51.307852,5716.49064,1.010896,9.045487,1.239399,1.326378,0.459864
min,117.0,0.0,1.0,1.0,17.0,1.0,1.0,0.0
25%,121.0,45.0,5051.5,1.0,28.0,2.0,1.0,0.0
50%,131.0,91.0,9665.0,2.0,40.0,2.0,2.0,1.0
75%,134.0,135.0,14618.0,3.0,45.0,3.0,4.0,1.0
max,136.0,187.0,20097.0,4.0,63.0,14.0,5.0,1.0


此份数据集似乎没有异常值

In [4]:
# Use data.info() to inspect missing values (non-null count per column)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49998 entries, 0 to 49997
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id_num                     49998 non-null  object 
 1   program_type               49267 non-null  object 
 2   program_id                 49299 non-null  object 
 3   program_duration           49323 non-null  float64
 4   test_id                    49273 non-null  float64
 5   test_type                  49296 non-null  object 
 6   difficulty_level           49295 non-null  object 
 7   trainee_id                 49259 non-null  float64
 8   gender                     49291 non-null  object 
 9   education                  49296 non-null  object 
 10  city_tier                  49298 non-null  float64
 11  age                        30619 non-null  float64
 12  total_programs_enrolled    49306 non-null  float64
 13  is_handicapped             49280 non-null  obj

每个特征都有一定的缺失，但除了age特征，其他特征缺失都并不多。

In [5]:
data.isnull().sum()
# First, an overall look at the number of missing values per column

id_num                           0
program_type                   731
program_id                     699
program_duration               675
test_id                        725
test_type                      702
difficulty_level               703
trainee_id                     739
gender                         707
education                      702
city_tier                      700
age                          19379
total_programs_enrolled        692
is_handicapped                 718
trainee_engagement_rating      772
is_pass                          0
dtype: int64

In [None]:
# Visualise the missing-value pattern with the missingno library
msno.matrix(data)

<AxesSubplot:>

**白线越多，缺失值越多**，再次说明了其余都是个别有缺失值，age则是大部分。

In [None]:
# Draw a heatmap to inspect how missing values in different columns relate to each other
msno.heatmap(data)

热力图表明，各特征缺失值的出现之间没有相关性。

# 缺失值处理

缺失值如果是**数值型特征**，可以考虑**用平均值填充**。如果是字符串型特征，可以考虑删除该项id。上限：0.70883

新的思考角度：可以用其他特征来预测缺失值，如果是数值型特征用线性回归，分类型特征用逻辑回归。（做了之后并不觉得可行）

In [None]:
# Mean-impute the two numeric columns chosen for filling, then re-check the
# remaining missing-value counts per column.
for col in ('age', 'program_duration'):
    data[col] = data[col].fillna(data[col].mean())
data.isnull().sum()

In [None]:
# Rows that still miss any other feature are dropped: those features are used
# for classification, and a wrongly guessed category would be misleading.
orilen = data.shape[0]
data = data.dropna(axis=0, how='any')
aftlen = data.shape[0]
print('共删去%d条数据' % (orilen-aftlen))

# 特征离散化

In [None]:
# List the distinct test_type values before encoding the column
data['test_type'].unique()

In [None]:
# Integer codes for test_type; the dict is kept as a notebook global so the
# same encoding can be re-applied to the test sets.
test_type_mapping = {label: code for code, label in enumerate(('offline', 'online'))}
data['test_type'] = data['test_type'].map(test_type_mapping)

In [None]:
# List the distinct gender values before encoding the column
data['gender'].unique()

In [None]:
# Binary encoding of gender: M -> 0, F -> 1
gender_mapping = dict(M=0, F=1)
data['gender'] = data['gender'].map(gender_mapping)

In [None]:
# List the distinct education values before encoding the column
data['education'].unique()

In [None]:
# Ordinal encoding of education level, numbered from 1 in the order listed.
# NOTE(review): this order ranks Matriculation above High School Diploma —
# confirm that is the intended ordering.
education_mapping = {
    level: rank
    for rank, level in enumerate(
        ('No Qualification', 'High School Diploma', 'Matriculation', 'Bachelors', 'Masters'),
        start=1,
    )
}
data['education'] = data['education'].map(education_mapping)

In [None]:
# List the distinct difficulty_level values before encoding the column
data['difficulty_level'].unique()

In [None]:
# Ordinal encoding of difficulty, numbered 1..4 from easiest to hardest.
# The key 'vary hard' is kept exactly as written — presumably it matches the
# spelling in the raw data; verify against the unique() output.
difficulty_mapping = dict(zip(('easy', 'intermediate', 'hard', 'vary hard'), range(1, 5)))
data['difficulty_level'] = data['difficulty_level'].map(difficulty_mapping)

In [None]:
# List the distinct is_handicapped values before encoding the column
data['is_handicapped'].unique()

In [None]:
# Binary encoding of is_handicapped: N -> 0, Y -> 1
handicap_mapping = dict(N=0, Y=1)
data['is_handicapped'] = data['is_handicapped'].map(handicap_mapping)

# 特征工程

In [None]:
# Correlation matrix of the numeric features.
# The frame still contains string columns at this point (id_num, program_type,
# program_id). Older pandas silently skipped them in corr(); pandas 2.x raises
# instead, so the numeric columns are selected explicitly. select_dtypes is
# used rather than corr(numeric_only=True) because it also works on older
# pandas releases.
data.select_dtypes(include='number').corr()
# Observation: test_type, age and trainee_engagement_rating show a comparatively strong correlation with is_pass

In [None]:
# Remove the features that will not be used for modelling
data = data.drop(columns=['id_num', 'program_type', 'test_id', 'trainee_id'])

trainee_id和test_id为**无序的离散值**，我考虑直接删除掉。而program_type和program_id有一定的顺序规律，对is_pass有影响，又因为program_id的值也表明了program_type的身份（例：Y_1表明其program_type为Y），所以可以删去program_type，**用独热编码来表达program_id**。

In [None]:
# Preview the frame after dropping the unused columns
data.head(5)

In [None]:
# One-hot encode program_id, then replace the original column with its
# indicator columns (appended on the right of the remaining features).
program_id_onehot = pd.get_dummies(data['program_id'])
onehoted = pd.concat([data.drop('program_id', axis=1), program_id_onehot], axis=1)

In [None]:
# Preview the one-hot encoded frame
onehoted.head(5)

# 归一化

program_duration和age需进行归一化

In [None]:
# Scale age and program_duration by their own column means, so each scaled
# column has mean 1.
for col in ('age', 'program_duration'):
    onehoted[col] = onehoted[col] / onehoted[col].mean()
onehoted.head(5)
# Scaling done

# 算法拟合

In [None]:
from LogisticRegression import LogisticRegression # 自己写的逻辑回归
from Evaluate import accuracy
from sklearn.model_selection import KFold

In [None]:
# 4-fold cross-validation with shuffling. random_state is fixed so the fold
# assignment (and therefore the reported accuracies) is reproducible across runs.
kf = KFold(n_splits=4, shuffle=True, random_state=42)

In [None]:
# Custom logistic-regression model. times/alpha are presumably the iteration
# count and the learning rate — confirm against LogisticRegression.py.
lr = LogisticRegression(times=1000,alpha=0.25)
thetas = [] # theta obtained from each fit
corrects = [] # accuracy of each fit, in the same order as thetas

In [None]:
# Separate the prepared frame into the is_pass target and the feature matrix
Y = onehoted['is_pass']
X = onehoted.drop(columns='is_pass')

### 用两种分割数据集的方法进行测试，选择效果最好的theta

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out evaluation: 80% train / 20% test.
# - The features are cast to float before conversion: the one-hot columns can
#   be bool (newer pandas), which would otherwise give a dtype=object array.
#   The test-set cells already cast with astype(float).
# - random_state is fixed so the split and its accuracy are reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(X.astype(float)), np.array(Y), test_size=0.2, random_state=42
)
lr.fit(x_train, y_train)
# Store a copy: if the custom model updates theta in place on the next fit(),
# a stored reference would be silently overwritten (depends on the
# implementation in LogisticRegression.py).
thetas.append(lr.theta.copy())

In [None]:
# Evaluate the hold-out fit: predict on the held-out rows and record the accuracy
ypre = lr.predict(x_test)
correct = accuracy(ypre, y_test)
corrects.append(correct)
print(correct)

In [None]:
# K-fold cross-validation: fit on each training fold, then keep the fitted
# theta and the accuracy measured on the matching validation fold.
# The features are converted to a float NumPy array once, outside the loop, so
# the model receives the same input type as in the hold-out cell (arrays, not
# DataFrames/Series) and bool one-hot columns cannot produce dtype=object.
X_values = np.array(X.astype(float))
Y_values = np.array(Y)
for train_index, test_index in kf.split(X_values):
    x_train, x_test = X_values[train_index], X_values[test_index]
    y_train, y_test = Y_values[train_index], Y_values[test_index]
    lr.fit(x_train, y_train)
    # Store a copy: if fit() updates theta in place, every stored entry would
    # otherwise end up pointing at the last fold's parameters (depends on the
    # implementation in LogisticRegression.py).
    thetas.append(lr.theta.copy())
    y_pre = lr.predict(x_test)
    correct = accuracy(y_test, y_pre)
    corrects.append(correct)
    print('准确率为:%f' % correct)

In [None]:
# Pick the fit with the highest recorded accuracy (the earliest one on ties)
best_index = max(range(len(corrects)), key=lambda i: corrects[i])
best_theta = thetas[best_index]
print('准确率最高的theta为第%d个' % (best_index+1))

# 预测test1

In [None]:
# Load the first test file
test1 = pd.read_csv('../data/test/test1.csv')

In [None]:
# Preview the first five rows of test1
test1.head(5)

In [None]:
# Missing-value count per column in test1
test1.isnull().sum()

In [None]:
# Missing-value handling: mean-impute age and trainee_engagement_rating.
# NOTE(review): the fill values are test1's own column means, not the training
# means — confirm this is intended.
for col in ('age', 'trainee_engagement_rating'):
    test1[col] = test1[col].fillna(test1[col].mean())

In [None]:
# Feature encoding: apply the mappings defined for the training set to test1.
# Series.map leaves NaN for any value that is not a key of the mapping.
for col, mapping in (
    ('test_type', test_type_mapping),
    ('gender', gender_mapping),
    ('education', education_mapping),
    ('difficulty_level', difficulty_mapping),
    ('is_handicapped', handicap_mapping),
):
    test1[col] = test1[col].map(mapping)

In [None]:
# Drop the unused features, keeping id_num aside for the submission file
record = test1['id_num'] # remember id_num
test1 = test1.drop(['id_num', 'program_type', 'test_id', 'trainee_id', 'is_pass'], axis=1)

# One-hot encode program_id, append the indicator columns, drop the original column
test1_program_id_onehot = pd.get_dummies(test1['program_id'])
test1_onehoted = pd.concat([test1, test1_program_id_onehot], axis=1)
test1_onehoted = test1_onehoted.drop('program_id', axis=1)

# Align with the training feature matrix X: same columns in the same order.
# get_dummies only creates columns for the program_id values present in THIS
# file, so a missing or extra program would otherwise shift the features
# relative to the fitted theta. Indicators absent from test1 are filled with 0;
# columns the model never saw are dropped.
test1_onehoted = test1_onehoted.reindex(columns=X.columns, fill_value=0)

# Scale with the TRAINING means so test features go through the same transform
# the model was fitted on. `data` still holds the unscaled training columns:
# scaling was applied to `onehoted`, a separate frame built from `data`.
test1_onehoted['age'] = test1_onehoted['age'] / data['age'].mean()
test1_onehoted['program_duration'] = test1_onehoted['program_duration'] / data['program_duration'].mean()

**进行预测**

In [None]:
# Feature matrix for test1 as a float NumPy array
X_test = test1_onehoted.astype(float).to_numpy()

# Load best_theta into the model as its current theta, then predict
lr.theta = best_theta
y_pre = lr.predict(X_test)

In [None]:
# Build the test1 submission: id_num next to the test1 predictions
submit = pd.DataFrame(y_pre)
submission = pd.concat([record, submit], axis=1) # record holds the original id_num values
submission.columns = ['id_num', 'is_pass']

In [None]:
# submission.to_csv('sub_2ways_4KFold_1k_alpha0.25.csv', index=False)

# 预测test2

In [None]:
# Load the second test file
test2 = pd.read_csv('../data/test/test2.csv')
# Preview the first five rows
test2.head(5)

In [None]:
# Missing-value count per column in test2
test2.isnull().sum()

In [None]:
# Missing-value handling: mean-impute age and trainee_engagement_rating.
# NOTE(review): the fill values are test2's own column means, not the training
# means — confirm this is intended.
for col in ('age', 'trainee_engagement_rating'):
    test2[col] = test2[col].fillna(test2[col].mean())

In [None]:
# Feature encoding: apply the mappings defined for the training set to test2.
# Series.map leaves NaN for any value that is not a key of the mapping.
for col, mapping in (
    ('test_type', test_type_mapping),
    ('gender', gender_mapping),
    ('education', education_mapping),
    ('difficulty_level', difficulty_mapping),
    ('is_handicapped', handicap_mapping),
):
    test2[col] = test2[col].map(mapping)

In [None]:
# Drop the unused features, keeping id_num aside for the submission file
record2 = test2['id_num'] # remember id_num
test2 = test2.drop(['id_num', 'program_type', 'test_id', 'trainee_id', 'is_pass'], axis=1)

# One-hot encode program_id, append the indicator columns, drop the original column
test2_program_id_onehot = pd.get_dummies(test2['program_id'])
test2_onehoted = pd.concat([test2, test2_program_id_onehot], axis=1)
test2_onehoted = test2_onehoted.drop('program_id', axis=1)

# Align with the training feature matrix X: same columns in the same order.
# get_dummies only creates columns for the program_id values present in THIS
# file, so a missing or extra program would otherwise shift the features
# relative to the fitted theta. Indicators absent from test2 are filled with 0;
# columns the model never saw are dropped.
test2_onehoted = test2_onehoted.reindex(columns=X.columns, fill_value=0)

# Scale with the TRAINING means so test features go through the same transform
# the model was fitted on. `data` still holds the unscaled training columns:
# scaling was applied to `onehoted`, a separate frame built from `data`.
test2_onehoted['age'] = test2_onehoted['age'] / data['age'].mean()
test2_onehoted['program_duration'] = test2_onehoted['program_duration'] / data['program_duration'].mean()

**进行预测**

In [None]:
# Feature matrix for test2 as a float NumPy array
X_test2 = test2_onehoted.astype(float).to_numpy()

# Load best_theta into the model as its current theta, then predict
lr.theta = best_theta
y_pre2 = lr.predict(X_test2)

In [None]:
# Build the test2 submission: id_num next to the test2 predictions.
# The second frame must be submit2 (test2 predictions), not `submit`, which
# holds the test1 predictions.
submit2 = pd.DataFrame(y_pre2)
submission2 = pd.concat([record2, submit2], axis=1) # record2 holds the original id_num values
submission2.columns = ['id_num', 'is_pass']

In [None]:
# Write the test2 submission to CSV without the index column
submission2.to_csv('sub2_2ways_4KFold_1k_alpha0.25.csv', index=False)