In [1]:
#必要なライブラリのインポート
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

In [2]:
#データの読み込み
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [3]:
#先頭五行を確かめる
train.head()

Unnamed: 0,id,description,jobflag
0,0,"Executes and writes portions of testing plans,...",2
1,1,Maintain Network Performance by assisting with...,3
2,2,Supports the regional compliance manager with ...,4
3,3,Keep up to date with local and national busine...,1
4,4,Assist with Service Organization Control (SOC)...,4


In [4]:
#trainのカテゴリごとの数を調べる
train["jobflag"].value_counts()

3    1376
1     624
4     583
2     348
Name: jobflag, dtype: int64

In [5]:
#訓練データとテストデータに分ける
X_train = pd.DataFrame(train["description"])
Y_train = pd.DataFrame(train["jobflag"])
X_test = pd.DataFrame(test["description"])

In [6]:
#単語数でjobflagを識別する
#使用するモデルはLinearSVC

In [6]:
tf_vec = TfidfVectorizer() #tf-idf
#学習させる
tf_vec.fit(X_train["description"])

TfidfVectorizer()

In [9]:
#transform()を使用してトレーニングデータをベクトル化
X_train_tfidf = tf_vec.fit_transform(X_train["description"])

In [11]:
# LinearSVCの取りうるモデルパラメータを設定
# LinearSVCの取りうるパラメータの組み合わせは以下の三つ
Standard = LinearSVC(penalty='l2', loss='hinge', dual=True, tol=1e-3)
LossL2 = LinearSVC(penalty='l1', loss='squared_hinge', dual=False, tol=1e-3)
PenaltyL1 = LinearSVC(penalty='l2', loss='squared_hinge', dual=True, tol=1e-3)

#この中から最適な組み合わせをグリッドサーチで探す
model_set = [Standard, LossL2, PenaltyL1]

# グリッドサーチしたいパラメータCの値域をnp.logspaceで探す
tuned_parameters = [{'C': np.logspace(-1, 2, 30)}]



# 各モデルを一つずつCに関してグリッドサーチを行う
# 各モデルを一つずつCに関してグリッドサーチを行う
for model in model_set:
    # 5-fold Cross-Validationで評価指標はprecision(適合率)を選択  
    grid_search = GridSearchCV(model, tuned_parameters, cv=10,scoring="accuracy")
    # 設定したパラメータで学習しつつ検証を行う
    grid_search.fit(X_train_tfidf, Y_train)


print("スコアがベストな時のパラメータ:{}".format(grid_search.best_params_))
print("スコアがベストな時のcross-validation score:{:.2f}".format(grid_search.best_score_))
print("スコアがベストな時の設定条件:{}".format(grid_search.best_estimator_))

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


スコアがベストな時のパラメータ:{'C': 0.41753189365604015}
スコアがベストな時のcross-validation score:0.66
スコアがベストな時の設定条件:LinearSVC(C=0.41753189365604015, tol=0.001)


In [12]:
#LinearSVCモデルを構築
model = LinearSVC(C=0.41753189365604015, class_weight=None, dual=True,
     fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
     max_iter=1000, multi_class='ovr', penalty='l2', random_state=None,
     tol=0.001, verbose=0)
model.fit(X_train_tfidf, Y_train["jobflag"])

LinearSVC(C=0.41753189365604015, tol=0.001)

In [13]:
#モデルの評価をする(どれくらい正確か確認する)
print("Train accuracy = %.3f" % model.score(X_train_tfidf, Y_train))

Train accuracy = 0.917


In [14]:
#作ったモデルを評価用データに適用する
input_test = tf_vec.transform(test["description"])

pd.DataFrame(input_test.toarray(), columns=tf_vec.get_feature_names())

Unnamed: 0,000,10,11,20,2003,2008,2012,2013,2016,24,...,yarn,year,yearly,yet,yield,you,younger,your,zeiss,zookeeper
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199364,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1738,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1740,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


In [15]:
#予測を実行する
model.predict(input_test)

array([4, 3, 3, ..., 1, 3, 3], dtype=int64)

In [16]:
#test用データにjobflagというカラムを足し、jobflagを一つ一つ予測する
test["jobflag"] = model.predict(input_test)

In [17]:
#出力
test

Unnamed: 0,id,description,jobflag
0,2931,Work with the implementation teams,4
1,2932,"Set technology direction, strategy, policies, ...",3
2,2933,Experience with Orchestration and Automation p...,3
3,2934,"Apply your expertise in quantitative analysis,...",1
4,2935,Provide regular maintenance for knowledge rete...,3
...,...,...,...
1738,4669,Conduct data analysis to make business recomme...,1
1739,4670,Obtain shipping bills of lading and necessary ...,3
1740,4671,Connect and collaborate with subject matter ex...,1
1741,4672,Ensures continuity of business execution i.e. ...,3


In [18]:
#それぞれのカテゴリが何個あるか求める
test["jobflag"].value_counts()

3    992
1    389
4    278
2     84
Name: jobflag, dtype: int64

In [34]:
#作成した予測データをexcelに落とし込む
test.to_csv("tf-idf_SVM(CV).csv",index =False, encoding = "utf_8_sig")

In [41]:
ls

 ドライブ C のボリューム ラベルは Windows です
 ボリューム シリアル番号は 94B8-00F0 です

 C:\Users\keisu のディレクトリ

2021/05/12  21:47    <DIR>          .
2021/05/12  21:47    <DIR>          ..
2021/05/12  21:49    <DIR>          .conda
2021/05/12  18:40                43 .condarc
2021/05/12  21:46    <DIR>          .ipynb_checkpoints
2021/05/12  19:49    <DIR>          .ipython
2021/01/29  19:30    <DIR>          3D Objects
2021/05/12  18:38    <DIR>          anaconda3
2021/05/06  01:09    <DIR>          ansel
2021/01/29  19:30    <DIR>          Contacts
2021/05/12  21:44    <DIR>          Downloads
2021/01/29  19:30    <DIR>          Favorites
2021/01/29  19:30    <DIR>          Links
2021/01/29  19:30    <DIR>          Music
2021/05/07  15:15    <DIR>          OneDrive
2021/02/03  13:57    <DIR>          Saved Games
2021/01/29  19:31    <DIR>          Searches
2021/05/12  20:23           218,596 test.csv
2021/05/12  20:48           229,698 tf-idf_SVM
2021/05/12  20:52           222,095 tf-idf_SVM(CV).csv
2021/05/1