1. 机器学习面试题：
任务：
使用线性回归、多项式回归和决策树回归预测学生期末成绩，并比较模型性能。数据集采用UCI机器学习库中的"学生表现数据集"。

实现步骤：
1. 将数据加载到Pandas DataFrame
2. 分离特征(X)和目标变量(y)
3. 按80-20划分训练测试集(random_state=42)
4. 使用StandardScaler标准化特征
5. 训练以下模型：
   - 线性回归
   - 决策树回归
   - 多项式回归(degree=2)
6. 计算各模型的MSE和R2分数
7. 输出评估指标

In [77]:
import pandas as pd
data = pd.read_csv('./datasets/student-mat.csv', sep=';')

In [78]:
data.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

In [80]:
data.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [81]:
corr_matrix = data.corr(numeric_only=True)
corr_matrix['G3'].sort_values()

failures     -0.360415
age          -0.161579
goout        -0.132791
traveltime   -0.117142
health       -0.061335
Dalc         -0.054660
Walc         -0.051939
freetime      0.011307
absences      0.034247
famrel        0.051363
studytime     0.097820
Fedu          0.152457
Medu          0.217147
G1            0.801468
G2            0.904868
G3            1.000000
Name: G3, dtype: float64

In [82]:
X = data.iloc[:, :-3]
y = data.iloc[:, -1]

In [83]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

preprocessing = ColumnTransformer([("cat",OneHotEncoder(handle_unknown="ignore"), make_column_selector(dtype_include=object))],
                                    remainder=StandardScaler())

In [85]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("linear", LinearRegression()),
])
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)
linear_rmse = mean_squared_error(y_test,y_pred, squared=False)
linear_r2 = r2_score(y_test,y_pred)
print(f'RMSE:{linear_rmse},R2:{linear_rmse}')

RMSE:4.300639533321715,R2:4.300639533321715


In [86]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("linear", DecisionTreeRegressor(random_state=42)),
])
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)
dest_rmse = mean_squared_error(y_test,y_pred, squared=False)
dest_r2 = r2_score(y_test,y_pred)
print(f'RMSE:{dest_rmse},R2:{dest_rmse}')

RMSE:4.844414766551244,R2:4.844414766551244


In [87]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error,r2_score
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("poly", PolynomialFeatures(degree=2)),
    ("linear", LinearRegression()),
])
full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)
poly_rmse = mean_squared_error(y_test,y_pred, squared=False)
poly_r2 = r2_score(y_test,y_pred)
print(f'RMSE:{poly_rmse},R2:{poly_rmse}')

RMSE:6.773614261806036,R2:6.773614261806036


2. 处理泰坦尼克号数据集，在data/目录里

像之前课程里  对读取加州房价数据所做的那样  解压缩此压缩包。

这将提供两个CSV文件，train.csv和test.csv，可以使用pandas.read_csv()加载它。

最后目标：目标是训练一个可以根据其他列预测Survived（是否存活）列的分类器

In [88]:
import pandas as pd
train = pd.read_csv('./datasets/titanic/train.csv',index_col='PassengerId')
test = pd.read_csv('./datasets/titanic/test.csv',index_col='PassengerId')

In [89]:
train.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [90]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [91]:
train_new = train.drop(['Name','Ticket','Cabin'], axis=1)
test_new = test.drop(['Name','Ticket','Cabin'], axis=1)

In [92]:
X_train = train_new.drop(['Survived'], axis=1)
y_train = train_new['Survived']

In [93]:
train_new.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699113,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526507,1.102743,0.806057,49.693429
min,0.0,1.0,0.4167,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [99]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

preprocessing = ColumnTransformer([
    ("cat", Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ]), make_column_selector(dtype_include=object)),

    ("num", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ]), make_column_selector(dtype_include='number'))
])

full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("log", LogisticRegression()),
])

param_dist = {
    'log__C': [0.01, 0.1, 1, 10],
    'log__penalty': ['l1', 'l2'],
    'log__solver': ['liblinear'],
    'log__class_weight': ['balanced', None]
}
rnd_search = RandomizedSearchCV(full_pipeline, param_distributions=param_dist, n_iter=16, cv=5, random_state=42, n_jobs=-1,
   scoring='accuracy' )
rnd_search.fit(X_train, y_train)
rnd_search.best_params_

{'log__solver': 'liblinear',
 'log__penalty': 'l2',
 'log__class_weight': None,
 'log__C': 0.1}

In [102]:
import numpy as np
# 还没看什么意思
# ✅ 1. 获取最佳模型中的逻辑回归系数
best_model = rnd_search.best_estimator_.named_steps['log']
coefficients = best_model.coef_[0]  # 二分类，取第一个类别

# ✅ 2. 从已拟合的 pipeline 中提取预处理器
fitted_preprocessing = rnd_search.best_estimator_.named_steps['preprocessing']

# ✅ 3. 获取 one-hot 编码后的类别特征名
onehot_encoder = fitted_preprocessing.named_transformers_['cat'].named_steps['onehot']
cat_features = X_train.select_dtypes(include=['object']).columns
feature_names_cat = onehot_encoder.get_feature_names_out(cat_features)

# ✅ 4. 获取数值型特征名（保持不变）
num_features = X_train.select_dtypes(include=['number']).columns

# ✅ 5. 合并所有特征名
feature_names = np.concatenate([feature_names_cat, num_features])

# ✅ 6. 创建特征重要性 DataFrame
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Importance': abs(coefficients)
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

print(feature_importance)

      Feature  Importance
0    Sex_male    1.176218
1  Sex_female    1.098997
2      Pclass    0.766376
3         Age    0.415010
4       SibSp    0.285484
5  Embarked_S    0.243800
6        Fare    0.135624
7  Embarked_C    0.111790
8  Embarked_Q    0.054789
9       Parch    0.042227


In [95]:
rnd_search.predict(test_new)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [96]:
survival_by_sex = train.groupby('Sex')['Survived'].mean()
survival_by_sex

Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [97]:
train['Age'].fillna(train['Age'].median(), inplace=True)

train['AgeGroup'] = pd.cut(train['Age'], bins=[0, 10, 18, 60, 100])

survival_by_age = train.groupby('AgeGroup',observed=False)['Survived'].mean()
survival_by_age

AgeGroup
(0, 10]      0.593750
(10, 18]     0.426667
(18, 60]     0.365753
(60, 100]    0.227273
Name: Survived, dtype: float64