In [41]:
# Standard library
import math

# Third-party
import numpy as np
import pandas as pd
import quandl
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression

# `sklearn.cross_validation` was removed in scikit-learn 0.20; its helpers now live
# in `sklearn.model_selection`. Bind whichever module is available to the name
# `cross_validation` so the `cross_validation.train_test_split(...)` calls further
# down run unchanged on both old and current scikit-learn releases.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # very old scikit-learn without model_selection
    from sklearn import cross_validation

In [42]:
# Fetch the "WIKI/GOOGL" dataset from Quandl (network call); the result is a
# DataFrame indexed by Date.
df = quandl.get("WIKI/GOOGL")

# Preview the first rows with head() (5 rows by default).
print(df.head())

              Open    High     Low    Close      Volume  Ex-Dividend  \
Date                                                                   
2004-08-19  100.01  104.06   95.96  100.335  44659000.0          0.0   
2004-08-20  101.01  109.08  100.50  108.310  22834300.0          0.0   
2004-08-23  110.76  113.48  109.05  109.400  18256100.0          0.0   
2004-08-24  111.24  111.60  103.57  104.870  15247300.0          0.0   
2004-08-25  104.76  108.00  103.88  106.000   9188600.0          0.0   

            Split Ratio  Adj. Open  Adj. High   Adj. Low  Adj. Close  \
Date                                                                   
2004-08-19          1.0  50.159839  52.191109  48.128568   50.322842   
2004-08-20          1.0  50.661387  54.708881  50.405597   54.322689   
2004-08-23          1.0  55.551482  56.915693  54.693835   54.869377   
2004-08-24          1.0  55.792225  55.972783  51.945350   52.597363   
2004-08-25          1.0  52.542193  54.167209  52.100830   53.1

In [43]:
# Keep only the "Adj." price/volume columns. The explicit .copy() makes `df` an
# independent frame rather than a selection of the downloaded one, so adding
# columns to it in later cells does not trigger pandas' SettingWithCopyWarning.
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
print(df.head())

            Adj. Open  Adj. High   Adj. Low  Adj. Close  Adj. Volume
Date                                                                
2004-08-19  50.159839  52.191109  48.128568   50.322842   44659000.0
2004-08-20  50.661387  54.708881  50.405597   54.322689   22834300.0
2004-08-23  55.551482  56.915693  54.693835   54.869377   18256100.0
2004-08-24  55.792225  55.972783  51.945350   52.597363   15247300.0
2004-08-25  52.542193  54.167209  52.100830   53.164113    9188600.0


In [44]:
# Derive two percentage features and keep only the columns used for modelling.
#   HL_PCT     - (high - low) / close * 100: a rough measure of intraday volatility
#   PCT_change - (close - open) / open * 100: the daily percentage change
# assign() builds a new frame instead of writing into a selection-derived one, and
# the trailing .copy() keeps the later label assignment free of
# SettingWithCopyWarning.
df = df.assign(
    HL_PCT=(df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0,
    PCT_change=(df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0,
)[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()
print(df.head())


            Adj. Close    HL_PCT  PCT_change  Adj. Volume
Date                                                     
2004-08-19   50.322842  8.072956    0.324968   44659000.0
2004-08-20   54.322689  7.921706    7.227007   22834300.0
2004-08-23   54.869377  4.049360   -1.227880   18256100.0
2004-08-24   52.597363  7.657099   -5.726357   15247300.0
2004-08-25   53.164113  3.886792    1.183658    9188600.0


In [45]:
# Column whose future value the model will learn to predict.
forecast_col = 'Adj. Close'

# Replace missing values with a sentinel instead of dropping the rows.
# fillna() returns a new frame, so `df` is rebound rather than modified in place.
df = df.fillna(value=-99999)

# Forecast horizon = 1% of the dataset length, rounded up by math.ceil().
# Example: with 100 days of prices the model predicts 1 day ahead.
forecast_out = int(math.ceil(0.01 * len(df)))
print(forecast_out)

35


In [46]:
# pandas shift() reference: https://www.cnblogs.com/iamxyq/p/6283334.html
# label[i] is the value of `forecast_col` forecast_out rows later, i.e. the price the
# model should predict from row i. The last forecast_out rows have no future value
# (NaN label) and are dropped. assign()/dropna() return new frames, so nothing is
# written in place onto a selection-derived frame.
df = df.assign(label=df[forecast_col].shift(-forecast_out)).dropna()
print(df.head())

            Adj. Close    HL_PCT  PCT_change  Adj. Volume      label
Date                                                                
2004-08-19   50.322842  8.072956    0.324968   44659000.0  69.078238
2004-08-20   54.322689  7.921706    7.227007   22834300.0  67.839414
2004-08-23   54.869377  4.049360   -1.227880   18256100.0  68.912727
2004-08-24   52.597363  7.657099   -5.726357   15247300.0  70.668146
2004-08-25   53.164113  3.886792    1.183658    9188600.0  71.219849


上面的代码不太好理解：label 列是如何生成的？它有什么用？
实际上，这一列的第 i 个元素就是 Adj. Close 列的第 i + forecast_out 个元素（由 shift(-forecast_out) 得到）。用简单的话说：这一列的每个值都是真实数据中 forecast_out 天之后的收盘价。我们把这一列作为线性回归模型的监督标签，让模型从中学习规律，之后才能用它来预测结果。
https://www.cnblogs.com/developerdaily/p/8440622.html

## 训练和测试

In [47]:
# Features: every column except the label. drop() removes rows by default, so the
# axis is named explicitly to drop a column (the positional form `drop(labels, 1)`
# is no longer accepted by pandas 2.x). drop() returns a new frame; `df` itself is
# left unchanged.
X = np.array(df.drop(['label'], axis=1))
print(X)
# Targets: the label column as a NumPy array.
y = np.array(df['label'])
print(y)

[[  5.03228418e+01   8.07295560e+00   3.24967503e-01   4.46590000e+07]
 [  5.43226889e+01   7.92170621e+00   7.22700723e+00   2.28343000e+07]
 [  5.48693765e+01   4.04936015e+00  -1.22788010e+00   1.82561000e+07]
 ..., 
 [  1.18159000e+03   1.53098791e+00   4.76194525e-01   2.77496700e+06]
 [  1.11920000e+03   1.79860615e+00  -7.29098295e-01   5.79888000e+06]
 [  1.06876000e+03   5.45024140e+00  -2.89384977e+00   3.74246900e+06]]
[   69.0782379     67.83941377    68.91272699 ...,  1026.55        1054.09
  1006.94      ]


In [49]:
# Standardize the feature matrix before fitting. This runs before the train/test
# split in the next cell, so the scaling is computed over all rows, including the
# ones that later end up in the test set.
X = preprocessing.scale(X)

In [50]:
# train_test_split reference: https://blog.csdn.net/xiaodongxiexie/article/details/71915259
# Shuffle the samples and split them 80/20 into training and test sets.
# random_state pins the shuffle so the scores reported below are reproducible
# from one run to the next.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=42)

In [52]:
# Support vector regression with the library's default hyper-parameters.
clf = svm.SVR()
# fit(X, y) is the supervised-learning step: learn the mapping from the training
# features to the training labels. Left as the last expression so the fitted
# estimator's repr is displayed as the cell output.
clf.fit(X_train, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [53]:
# Evaluate on the held-out test set. For scikit-learn regressors score() reports
# the coefficient of determination (R^2), so `confidence` is a goodness-of-fit
# figure, not a probability.
confidence = clf.score(X_test, y_test)
print(confidence)

0.791049923391


In [54]:
# Try a different estimator (a regressor, not a classifier): linear regression.
# Passing n_jobs=-1 tells the algorithm to use all available processors/threads.
# Note that `clf` is rebound here, replacing the SVR fitted earlier.
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
confidence = clf.score(X_test, y_test)
print(confidence)

0.975056036085




In [55]:
# Compare SVR kernels on the same train/test split: fit one model per kernel and
# print the kernel name next to its test-set score.
for kernel_name in ('linear', 'poly', 'rbf', 'sigmoid'):
    # fit() returns the fitted estimator, so construction and training chain.
    clf = svm.SVR(kernel=kernel_name).fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print(kernel_name, confidence)

linear 0.973198265372
poly 0.606154930296
rbf 0.791049923391
sigmoid 0.884948961733


In [None]:
# Consolidated version of the pipeline built step by step above. For follow-up
# optimization and the actual forecasting, see regression_predict_change.ipynb.
import math

import numpy as np
import pandas as pd
import quandl
from sklearn import preprocessing, svm
from sklearn.linear_model import LinearRegression

# `sklearn.cross_validation` was removed in scikit-learn 0.20 in favour of
# `sklearn.model_selection`; bind whichever exists to the same name.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # very old scikit-learn without model_selection
    from sklearn import cross_validation

# --- Load and reduce to the adjusted columns --------------------------------
# .copy() makes each selection an independent frame so the column assignments
# below do not trigger SettingWithCopyWarning.
df = quandl.get("WIKI/GOOGL")
df = df[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()

# --- Feature engineering -----------------------------------------------------
# HL_PCT: high-low spread relative to the close; PCT_change: open-to-close change.
df['HL_PCT'] = (df['Adj. High'] - df['Adj. Low']) / df['Adj. Close'] * 100.0
df['PCT_change'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0

df = df[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']].copy()
forecast_col = 'Adj. Close'
# Fill gaps with a sentinel *before* creating the label, so the only NaNs left
# afterwards are the label values of the last forecast_out rows.
df = df.fillna(value=-99999)
# Forecast horizon: 1% of the dataset length, rounded up.
forecast_out = int(math.ceil(0.01 * len(df)))
df['label'] = df[forecast_col].shift(-forecast_out)

# --- Build X / y -------------------------------------------------------------
# Name the axis explicitly: the positional form drop(labels, 1) is rejected by pandas 2.x.
X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)
# Scale over all rows first, then discard the last forecast_out rows, which have
# no label; this keeps X aligned row-for-row with y after the dropna() below.
X = X[:-forecast_out]
df = df.dropna()
y = np.array(df['label'])
# random_state pins the shuffle so the printed score is reproducible.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=42)

# --- Fit and evaluate --------------------------------------------------------
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
# For regressors score() is R^2 on the held-out set.
confidence = clf.score(X_test, y_test)
print(confidence)