In [16]:
import pandas as pd

# Column names are supplied explicitly via `names=` when the data file is read.
columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "year", "origin", "car name"]
# The path is a raw string so every backslash is literal. The previous literal
# mixed doubled backslashes with invalid escapes such as "\W" and "\s", which
# Python only tolerates with a warning. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory.
cars = pd.read_table(
    r"E:\Workspace\jupyter_notebook\notebook_idata_lesson01\scikit-learn\data\auto-mpg.data",
    # Whitespace-delimited fields; replaces the deprecated delim_whitespace=True.
    sep=r"\s+",
    names=columns)
# Preview the first five rows.
cars.head()


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [17]:
# get_dummies expands one column into several indicator columns, one per
# distinct value, each named with the "cyl" prefix.
dummy_cylinders = pd.get_dummies(cars["cylinders"], prefix="cyl")
# Replace the original "cylinders" column with its indicator columns:
# drop it first, then append the indicators column-wise on the right.
cars = pd.concat([cars.drop(columns="cylinders"), dummy_cylinders], axis=1)
cars.head()


Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,year,origin,car name,cyl_3,cyl_4,cyl_5,cyl_6,cyl_8
0,18.0,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,0,0,0,0,1
1,15.0,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,0,0,0,0,1
2,18.0,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,0,0,0,0,1
3,16.0,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,0,0,0,0,1
4,17.0,302.0,140.0,3449.0,10.5,70,1,ford torino,0,0,0,0,1


In [18]:
# Apply the same get_dummies treatment to "year": build one indicator column
# per distinct value, prefixed with "year".
dummy_years = pd.get_dummies(cars["year"], prefix="year")
# Append the indicators on the right, then remove the original "year" column.
cars = pd.concat([cars, dummy_years], axis="columns").drop(columns=["year"])
cars.iloc[:5]


Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin,car name,cyl_3,cyl_4,cyl_5,...,year_73,year_74,year_75,year_76,year_77,year_78,year_79,year_80,year_81,year_82
0,18.0,307.0,130.0,3504.0,12.0,1,chevrolet chevelle malibu,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15.0,350.0,165.0,3693.0,11.5,1,buick skylark 320,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,18.0,318.0,150.0,3436.0,11.0,1,plymouth satellite,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,16.0,304.0,150.0,3433.0,12.0,1,amc rebel sst,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17.0,302.0,140.0,3449.0,10.5,1,ford torino,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
import numpy as np

# Seeded generator so the shuffle, and therefore the train/test split, is the
# same on every Restart & Run All.
split_rng = np.random.default_rng(1)

# Permute row *positions* rather than index labels: the result is fed to
# .iloc, which is positional, so this stays correct even if `cars` does not
# carry a default 0..n-1 index.
shuffled_rows = split_rng.permutation(len(cars))
shuffled_cars = cars.iloc[shuffled_rows]

# Number of rows that go to the training set (70% of the data, rounded down).
highest_train_now = int(cars.shape[0] * .70)

# Split into a training set and a test set.
train = shuffled_cars.iloc[:highest_train_now]
test = shuffled_cars.iloc[highest_train_now:]


In [21]:
from sklearn.linear_model import LogisticRegression

# Distinct values of "origin", sorted in place in ascending order.
unique_origins = cars["origin"].unique()
unique_origins.sort()

# Use only the columns produced by the earlier get_dummies steps
# (names beginning with "cyl" or "year") as predictors.
features = [name for name in train.columns if name.startswith(("cyl", "year"))]
# The predictor matrix is the same for every classifier, so build it once.
x_train = train[features]

# Multi-class classification expressed as several binary classifications:
# one logistic regression per origin, each trained on "is this row that origin?".
models = {}
for origin in unique_origins:
    y_train = train["origin"].eq(origin)
    model = LogisticRegression()
    # fit() returns the fitted estimator itself, so store it directly.
    models[origin] = model.fit(x_train, y_train)


In [22]:
# Predictor columns of the test set; identical for every per-origin model.
x_test = test[features]

# One column per origin, holding column 1 of that model's predict_proba
# output (the probability of the positive class) for every test row.
testing_probs = pd.DataFrame(
    {origin: models[origin].predict_proba(x_test)[:, 1] for origin in unique_origins},
    columns=unique_origins,
)

# The result is the probability of each of the three classes (1, 2, 3) per row.
print(testing_probs)


            1         2         3
0    0.614745  0.213888  0.184206
1    0.486117  0.124116  0.415901
2    0.295561  0.282162  0.421629
3    0.850729  0.093195  0.063776
4    0.486117  0.124116  0.415901
5    0.279162  0.434978  0.274985
6    0.287484  0.400650  0.308688
7    0.835680  0.075298  0.088443
8    0.287484  0.400650  0.308688
9    0.287484  0.400650  0.308688
10   0.295561  0.282162  0.421629
11   0.753284  0.225694  0.087038
12   0.324228  0.447743  0.215828
13   0.204385  0.492616  0.316356
14   0.287484  0.400650  0.308688
15   0.919430  0.041827  0.068821
16   0.960474  0.033093  0.024432
17   0.486117  0.124116  0.415901
18   0.287484  0.400650  0.308688
19   0.845449  0.105831  0.054697
20   0.350253  0.297200  0.340813
21   0.287484  0.400650  0.308688
22   0.962001  0.028862  0.028639
23   0.486117  0.124116  0.415901
24   0.446868  0.221147  0.326354
25   0.204385  0.492616  0.316356
26   0.946368  0.044420  0.024221
27   0.350253  0.297200  0.340813
28   0.324228 