In [None]:
# 任务：
# 使用线性回归 和决策树回归预测学生期末成绩，并比较模型性能。数据集采用UCI机器学习库中的"学生表现数据集"。  数据文件：student-mat.csv
#
# 实现步骤：
# 1. 将数据加载到Pandas DataFrame
# 2. 分离特征(X)和目标变量(y)
# 3. 按80-20划分训练测试集(random_state=42)
# 4. 使用StandardScaler标准化特征
# 5. 训练以下模型：
#    - 线性回归
#    - 决策树回归
# 6. 计算各模型的MSE分数
# 7. 输出评估指标

In [6]:
import pandas as pd
from nltk import DecisionTreeClassifier
from pygments.lexer import include
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

data =pd.read_csv("../datasets/student/student-mat.csv",delimiter=";")
data.info  #有没有缺失值
print(data)


    school sex  age address famsize Pstatus  Medu  Fedu      Mjob      Fjob  \
0       GP   F   18       U     GT3       A     4     4   at_home   teacher   
1       GP   F   17       U     GT3       T     1     1   at_home     other   
2       GP   F   15       U     LE3       T     1     1   at_home     other   
3       GP   F   15       U     GT3       T     4     2    health  services   
4       GP   F   16       U     GT3       T     3     3     other     other   
..     ...  ..  ...     ...     ...     ...   ...   ...       ...       ...   
390     MS   M   20       U     LE3       A     2     2  services  services   
391     MS   M   17       U     LE3       T     3     1  services  services   
392     MS   M   21       R     GT3       T     1     1     other     other   
393     MS   M   18       R     LE3       T     3     2  services     other   
394     MS   M   19       U     LE3       T     1     1     other   at_home   

     ... famrel freetime  goout  Dalc  Walc health 

In [7]:
X=data.iloc[:,:30]
X
y=data.iloc[:,-1]
y


0       6
1       6
2      10
3      15
4      10
       ..
390     9
391    16
392     7
393    10
394     9
Name: G3, Length: 395, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test=train_test_split(X,test_size=0.2,random_state=42)
y_train,y_test=train_test_split(y,test_size=0.2,random_state=42)

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures
cat_cloumns_name = X_train.select_dtypes(include=['object', 'string']).columns.tolist()

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

preprocessing=ColumnTransformer([
    ("cat",cat_pipeline,cat_cloumns_name)
])

pipeline_linear=Pipeline([
    ("preprocessing", preprocessing),
    ("model_poly",PolynomialFeatures(degree=2)),
    ("stand", StandardScaler()),
    ("model",LinearRegression())
])

pipeline_linear.fit(X_train,y_train)

pipeline_decisiontree=Pipeline([
    ("preprocessing", preprocessing),
    ("model_poly",PolynomialFeatures(degree=2)),
    ("stand", StandardScaler()),
    ("model",DecisionTreeRegressor())
])

pipeline_decisiontree.fit(X_train,y_train)




In [21]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

mse_linear=mean_squared_error(y_test,pipeline_linear.predict(X_test))
mse_linear

mse_decisiontree=mean_squared_error(y_test, pipeline_decisiontree.predict(X_test))
mse_decisiontree

r2_liner=r2_score(y_test,pipeline_linear.predict(X_test))
r2_liner

r2_decisiontree=r2_score(y_test,pipeline_decisiontree.predict(X_test))
r2_decisiontree

-1.6563388866314503

In [None]:
# 2. 处理泰坦尼克号数据集，在data/目录里
#
# 像之前课程里  对读取加州房价数据所做的那样  解压缩此压缩包。
#
# 这将提供两个CSV文件，train.csv和test.csv，可以使用pandas.read_csv()加载它。
#
# 最后目标：目标是训练一个可以根据其他列预测Survived（是否存活）列的分类器

In [59]:
import tarfile
with tarfile.open("HW3/data/titanic.tgz") as tar:
    tar.extractall(path="HW3/data")

In [60]:
test=pd.read_csv("HW3/data/titanic/test.csv")

train=pd.read_csv("HW3/data/titanic/train.csv")

In [61]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [62]:
# ## 数据集的字段说明
# - PassengerId ：  每位乘客的唯一标识符
# - Survived    ：  是否幸存（1：幸存，0：未幸存）
# - Pclass      ：  乘客的舱位等级（1-一等舱，2-二等舱，3-三等舱）
# - Name        ：  姓名
# - Sex         ：  性别，male,female
# - Age         :   年龄
# - SibSp       ：  与乘客一起旅行的兄弟姐妹或配偶的数量
# - Parch       ：  与乘客一起旅行的父母或儿童人数
# - Ticket      ：  票证号
# - Fare        ：  票价
# - Cabin       ：  乘客所住的客舱编号
# - Embarked    ：  乘客登船的港口（C-瑟堡（法国），Q-皇后镇（爱尔兰），S-南安普顿（英格兰））

In [76]:
columns_name=["Pclass","Sex","Age","SibSp","Parch","Fare","Cabin","Embarked"]

X=train.loc[:, columns_name]

y=train.loc[:, "Survived"]

In [64]:
X.info()  #查看是否有缺失值

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Cabin     204 non-null    object 
 7   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(3)
memory usage: 55.8+ KB


In [65]:
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer, make_column_selector
import numpy as np
cat_cloumns_name = X.select_dtypes(include=['object', 'string']).columns.tolist()

num_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

preprocessing=ColumnTransformer([
    ("num",  num_pipeline,  make_column_selector(dtype_include=np.number)),
    ("cat", OneHotEncoder(handle_unknown="ignore"), ['Sex', 'Cabin', 'Embarked']),
])

#  ColumnTransformer    是并行的  所以前面要先写一个流水线处理数据  完事以后用ColumnTransformer
#


# preprocessing=ColumnTransformer([
#     ("cat",OneHotEncoder(handle_unknown="ignore"),cat_cloumns_name),
#     ("simplelmputer",SimpleImputer(strategy="median"),X.columns),
#     ("stand",StandardScaler(),X.columns)
# ])
#逻辑回归
pipeline= Pipeline([
    ("preprocessing", preprocessing),
    ("model", LogisticRegression())
])
pipeline.fit(X, y)

In [69]:
#模型性能的评估
#逻辑回归
from sklearn.metrics import accuracy_score #分类精度
from sklearn.metrics import f1_score
train_acc=accuracy_score(y,pipeline.predict(X))  #分类精度

train_f1=f1_score(y,pipeline.predict(X))#  f1 分数
train_f1








0.7717717717717718

In [80]:
#决策树
from sklearn.tree import DecisionTreeClassifier
pipeline_decisiontree=make_pipeline(
    preprocessing,
    DecisionTreeClassifier(),


)
pipeline_decisiontree.fit(X,y)




In [82]:
train_acc_decisiontree=accuracy_score(y,pipeline_decisiontree.predict(X))
train_acc_decisiontree

train_f1_decisiontree=f1_score(y,pipeline_decisiontree.predict(X))
train_f1_decisiontree

0.9822485207100592

In [83]:
#knn
from sklearn.neighbors import KNeighborsClassifier
pipeline_knn=make_pipeline(
    preprocessing,
    KNeighborsClassifier(),
)

pipeline_knn.fit(X,y)

In [85]:
pipeline_knn_acc=accuracy_score(y,pipeline_knn.predict(X))
pipeline_knn_acc

pipeline_knn_f1=f1_score(y,pipeline_knn.predict(X))
pipeline_knn_f1

0.8102409638554215

In [None]:
# 1. 精确率是什么，评估分类的性能为什么不能只用精确率
#精确率是 正确预测为正的样本/预测为正的样本（包括正确预测和错误预测）
#容易漏检其他重要的正类，  可能选择的不是最优模型   应该结合其他值一起估计
# 2. 简述下混淆矩阵是什么
#是一个n*n的表格 n是类别数  对比真实标签和预测标签之间的关系
# 3. 简述下各个性能指标的意思： 准确率，召回率，F1分数，假阳性，PR曲线，ROC曲线，AUC分数
#准确率：预测正确的占总样本的比列
#召回率：真正的对的被预测对的概率
#F1分数 ： 精确率和召回率的调和平均数
#假阳性 ： 预测为正类实际上是负类
#PR曲线：不同阈值下 精确率和召回率的曲线
#roc曲线：  不同阈值下  真阳率和假阳率的关系
#auc分数： 是roc曲线的曲下面积  =1  完美区分  =0.5 没有区分能力
# 4. 简述下准确率-召回率权衡
#精确率和召回率是评估模型的两个重要指标，他们存在此消彼长的关系
# 5. 如何用二元分类器 去解决多元分类的问题
#通过多种策略把二元分类扩展为多元分类
# 6. 什么是 多标签-多分类问题？
#单个样本同时属于多个分类