# Линейная регрессия

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack

In [5]:
# Load the salary training set and preview its first rows.
data = pd.read_csv('data/salary-train.csv')
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,International Sales Manager London ****k ****...,London,permanent,33000
1,An ideal opportunity for an individual that ha...,London,permanent,50000
2,Online Content and Brand Manager// Luxury Reta...,South East London,permanent,40000
3,A great local marketleader is seeking a perman...,Dereham,permanent,22500
4,Registered Nurse / RGN Nursing Home for Young...,Sutton Coldfield,,20355


In [11]:
# Lower-case every job description so the text features are case-insensitive.
data['FullDescription'] = data['FullDescription'].map(str.lower)
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london ****k ****...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager// luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse / rgn nursing home for young...,Sutton Coldfield,,20355


In [14]:
# Replace every character that is not a Latin letter or a digit with a space.
data['FullDescription'] = data['FullDescription'].replace(
    to_replace='[^a-zA-Z0-9]',
    value=' ',
    regex=True,
)
data.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,international sales manager london k ...,London,permanent,33000
1,an ideal opportunity for an individual that ha...,London,permanent,50000
2,online content and brand manager luxury reta...,South East London,permanent,40000
3,a great local marketleader is seeking a perman...,Dereham,permanent,22500
4,registered nurse rgn nursing home for young...,Sutton Coldfield,,20355


In [17]:
# Turn the cleaned descriptions into TF-IDF features, keeping only terms
# that occur in at least 5 documents (min_df=5).
descriptions = data['FullDescription']
tfidf = TfidfVectorizer(min_df=5)
data_transformed = tfidf.fit_transform(descriptions)

In [21]:
# Encode missing locations / contract types as the literal string 'nan' so
# that the categorical encoding below sees a regular string value in every row.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the column selection: the in-place form
# operates on an intermediate object, raises a chained-assignment
# FutureWarning in pandas 2.x, and no longer modifies `data` once
# Copy-on-Write is in effect (pandas 3.0).
data['LocationNormalized'] = data['LocationNormalized'].fillna('nan')
data['ContractTime'] = data['ContractTime'].fillna('nan')

In [24]:
# One-hot encode the two categorical columns: each row is turned into a
# {column: value} dict, which DictVectorizer expands into indicator features.
enc = DictVectorizer()
categ_records = data[['LocationNormalized', 'ContractTime']].to_dict('records')
X_train_categ = enc.fit_transform(categ_records)

In [29]:
# Join the TF-IDF text features and the one-hot categorical features
# column-wise into a single sparse training matrix.
X_train = hstack([data_transformed, X_train_categ])

In [31]:
# Fit a ridge regression on the combined feature matrix; the target is the
# SalaryNormalized column of the training data.
clf = Ridge(alpha=1, random_state=241)
y_train = data['SalaryNormalized']
clf.fit(X_train, y_train)

Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [33]:
# Load the small test set (rows to predict a salary for) and preview it.
data_test = pd.read_csv('data/salary-test-mini.csv')
data_test.head()

Unnamed: 0,FullDescription,LocationNormalized,ContractTime,SalaryNormalized
0,We currently have a vacancy for an HR Project ...,Milton Keynes,contract,
1,A Web developer opportunity has arisen with an...,Manchester,permanent,


In [35]:
# Apply the same preprocessing to the test set as was applied to the
# training set: lower-case the descriptions, replace every non-alphanumeric
# character with a space, and encode missing categorical values as 'nan'.
data_test['FullDescription'] = (
    data_test['FullDescription']
    .map(str.lower)
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)
# Assign the filled columns back rather than using fillna(inplace=True) on a
# column selection: that chained in-place call raises a FutureWarning in
# pandas 2.x and does not update `data_test` under Copy-on-Write (pandas 3.0).
data_test['LocationNormalized'] = data_test['LocationNormalized'].fillna('nan')
data_test['ContractTime'] = data_test['ContractTime'].fillna('nan')

In [38]:
# Build the test feature matrix with the vectorizers fitted on the training
# data (transform only, no refitting), in the same column order as X_train:
# text features first, then the one-hot categorical features.
test_categ_records = data_test[['LocationNormalized', 'ContractTime']].to_dict('records')
X_test_one_hot = enc.transform(test_categ_records)
X_test_text = tfidf.transform(data_test['FullDescription'])
X_test = hstack([X_test_text, X_test_one_hot])

In [42]:
# Predict salaries for the test rows with the fitted ridge model.
# NOTE: despite its name, `y_test` holds model predictions, not ground truth.
y_test = clf.predict(X_test)
print(y_test)

[56555.61500155 37188.32442618]


# Составление фондового индекса

In [64]:
from sklearn.decomposition import PCA
import numpy as np

In [45]:
# Load the daily closing prices (one column per ticker) and preview them.
# This rebinds `data`, which previously held the salary training set.
data = pd.read_csv('data/close_prices.csv')
data.head()

Unnamed: 0,date,AXP,BA,CAT,CSCO,CVX,DD,DIS,GE,GS,...,PFE,PG,T,TRV,UNH,UTX,V,VZ,WMT,XOM
0,2013-09-23,76.440002,117.510002,85.029999,24.27,125.519997,59.409999,64.75,24.280001,165.25,...,28.799999,79.279999,34.220001,86.379997,71.82,109.419998,196.240005,47.98,76.419998,87.75
1,2013-09-24,76.07,119.0,85.110001,24.139999,124.489998,59.319997,64.32,24.32,162.970001,...,28.709999,78.620003,34.09,85.870003,72.32,110.0,193.339996,47.27,75.75,87.360001
2,2013-09-25,75.989998,118.510002,84.5,24.43,124.07,59.319997,64.449997,24.23,162.309998,...,28.49,77.720001,34.049999,85.980003,71.980003,109.260002,191.559998,46.950001,74.650002,87.139999
3,2013-09-26,76.32,119.379997,84.199997,23.77,123.489998,59.509996,65.239998,24.25,162.289993,...,28.52,78.050003,34.23,85.830002,72.160004,109.660004,193.559998,47.669998,74.620003,87.07
4,2013-09-27,75.889999,118.739998,83.800003,23.33,122.639999,59.009995,65.190002,24.049999,159.850006,...,28.879999,77.209999,33.98,85.410004,71.989998,109.360001,193.050003,47.0,74.360001,86.900002


In [51]:
# Fit a 10-component PCA on the price columns: every column from 'AXP'
# through the last one, which leaves out the leading date column.
pca = PCA(n_components=10)
X_train = data.loc[:, 'AXP':]
pca.fit(X_train)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [53]:
# Share of the total variance explained by each of the fitted components.
pca.explained_variance_ratio_ 

array([0.73897118, 0.11007169, 0.04995088, 0.0287492 , 0.02215448,
       0.01931577, 0.00674853, 0.00614091, 0.00320594, 0.00305611])

In [55]:
# Count how many leading components are needed for their cumulative
# explained-variance ratio to reach 0.9. If the threshold is never reached,
# n ends up equal to the number of fitted components.
sum_ = 0
n = 0
for x in pca.explained_variance_ratio_:
    if sum_ >= 0.9:
        # Threshold already reached by the components counted so far.
        break
    sum_ += x
    n += 1
n

4

In [68]:
# Project the prices onto the fitted components; column 0 of the resulting
# frame holds the values of the first principal component.
# NOTE(review): the name is misspelled ('trransformed') but is kept as is
# because the correlation cell further down refers to it.
X_train_trransformed = pd.DataFrame(pca.transform(X_train))
X_train_trransformed[0]

0     -50.902404
1     -52.846909
2     -54.614439
3     -52.600566
4     -52.370123
5     -54.653412
6     -52.812575
7     -53.651146
8     -56.692727
9     -54.402655
10    -58.331645
11    -61.716265
12    -61.319400
13    -53.792524
14    -50.892634
15    -49.637408
16    -52.387615
17    -47.478543
18    -43.560335
19    -42.559280
20    -42.450271
21    -42.271248
22    -43.252728
23    -40.059345
24    -38.497263
25    -39.064923
26    -38.393044
27    -38.400558
28    -42.386170
29    -39.830574
         ...    
344    67.607802
345    74.206256
346    71.305000
347    68.041973
348    70.268799
349    71.435269
350    76.342232
351    74.859336
352    76.322836
353    75.621858
354    75.403574
355    80.946849
356    82.097270
357    83.045140
358    83.048730
359    84.251636
360    81.085307
361    88.953955
362    85.514365
363    83.441017
364    84.579747
365    77.973512
366    81.468090
367    72.450565
368    72.456036
369    81.290980
370    77.903534
371    83.7981

In [65]:
# Load the Dow Jones index values and preview them.
# This rebinds `data_test`, which previously held the salary test set.
data_test = pd.read_csv('data/djia_index.csv')
data_test.head()

Unnamed: 0,date,^DJI
0,2013-09-23,15401.379883
1,2013-09-24,15334.589844
2,2013-09-25,15273.259766
3,2013-09-26,15328.299805
4,2013-09-27,15258.240234


In [71]:
# Pearson correlation matrix between the first principal component and the
# Dow Jones index column; the off-diagonal entry is the coefficient itself.
corr = np.corrcoef(X_train_trransformed[0], data_test['^DJI'])
corr

array([[1.        , 0.90965222],
       [0.90965222, 1.        ]])

In [82]:
# Ticker with the largest weight in the first principal component.
# np.argmax gives the position of the maximum directly, so there is no need
# to wrap the weights in a Series and sort all of them to read the top one.
X_train.columns[np.argmax(pca.components_[0])]

'V'