# In [None]:
# Reference site:
#https://qiita.com/ground0state/items/155b77f4c07e1a509a14
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import pickle
from sklearn.metrics import accuracy_score
# Use read_csv() for comma-separated files and read_table() for tab-separated files.

def load_data(filename):
    """Load a header-less, tab-separated feature file.

    The last column of the file is treated as the label; every other
    column is treated as a feature.

    Args:
        filename: Path of the UTF-8 encoded TSV file to read.

    Returns:
        A ``(X, Y)`` tuple where ``X`` is a DataFrame holding all columns
        except the last one and ``Y`` is a Series holding the last column.
    """
    table = pd.read_table(filename, sep="\t", header=None, encoding="UTF-8")
    label_column = table.columns[-1]  # the label is stored in the final column
    features = table.drop(columns=[label_column])
    labels = table[label_column]
    return features, labels

# Read the raw tab-separated news corpus. The file has no header row, so the
# column names are supplied here.
df = pd.read_table(
    "news_debug.txt",
    sep="\t",
    header=None,
    encoding="UTF-8",
    names=("ID","TITLE","URL","PUBLISHER","CATEGORY","STORY","HOSTNAME","TIME"),
)

# Keep only the articles published by the five selected publishers.
instance = df.query("PUBLISHER in ['Reuters','Huffington Post','Businessweek','Contactmusic.com','Daily Mail']")

# Only the category label and the headline are needed downstream.
instance_loc = instance[["CATEGORY", "TITLE"]]
category_list = instance_loc["CATEGORY"].tolist()
title_list = instance_loc["TITLE"].tolist()


# Patterns are compiled once, outside the per-title work.
# ASCII and full-width punctuation / symbols to strip from titles.
_SYMBOLS_RE = re.compile('[!"#$%&\'\\\\()*+,-./:;<=>?@[\\]^_`{|}~「」〔〕“”〈〉『』【】＆＊・（）＄＃＠。、？！｀＋￥％]')
# A run of digits that directly follows a space.
_NUMBER_RE = re.compile(r" [0-9]+")
# Digits glued to the front of a word (e.g. "3-apple" after symbol removal).
# The captured part must start with a non-digit, non-space character so that a
# standalone "0" placeholder followed by another word is left untouched.
_NUMBER_PREFIX_RE = re.compile(r" [0-9]+([^\s0-9]\S*)")
# Number words that express a magnitude.
_MAGNITUDE_RE = re.compile("million|thousand|billion|trillion|quadrillion")


def _normalize_title(title):
    """Return *title* lower-cased, without symbols, and with numbers collapsed.

    Steps, in order:
      1. lower-case the text;
      2. delete punctuation and symbol characters;
      3. replace every space-preceded digit run with ``0``;
      4. drop digits glued to the front of a word ("3-apple" -> "apple");
      5. replace magnitude words (million, thousand, ...) with ``0``.
    """
    title = title.lower()
    title = _SYMBOLS_RE.sub("", title)
    title = _NUMBER_RE.sub(" 0", title)
    title = _NUMBER_PREFIX_RE.sub(r" \1", title)
    title = _MAGNITUDE_RE.sub("0", title)
    return title


# Normalise every title in place.
title_list[:] = [_normalize_title(title) for title in title_list]

# Convert each category code into its integer class id.
str_to_int = {"b": 0, "t": 1, "e": 2, "m": 3}
category_list[:] = [str_to_int[code] for code in category_list]

# To keep the feature matrix small, ignore low-frequency words
# (min_df=0.01 drops terms that appear in fewer than 1% of the titles).
vec_tfidf = TfidfVectorizer(min_df=0.01)
X = vec_tfidf.fit_transform(title_list)

print(type(X))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vec_tfidf, "get_feature_names_out"):
    feature_names = vec_tfidf.get_feature_names_out()
else:
    feature_names = vec_tfidf.get_feature_names()

# Convert the sparse matrix into a DataFrame with one column per term.
df = pd.DataFrame(X.toarray(), columns=feature_names)
df["CATEGORY"] = category_list  # append the label as the last column

# Hold out 20% of the rows, then split that hold-out set again:
# 75% of it becomes validation data and 25% becomes test data
# (i.e. 80% / 15% / 5% of all rows).
# NOTE(review): if an 80/10/10 split was intended, the second test_size
# should be 0.5 - confirm before changing.
train, valid = train_test_split(df, test_size=0.20)
valid, test = train_test_split(valid, test_size=0.25)


# Write each split as a tab-separated file without header or index.
train.to_csv("train.feature_debug.txt", sep="\t", header=False, index=False)
valid.to_csv("valid.feature_debug.txt", sep="\t", header=False, index=False)
# The file name previously contained a comma ("test,feature_debug.txt"), so the
# later read of "test.feature_debug.txt" could not find it.
test.to_csv("test.feature_debug.txt", sep="\t", header=False, index=False)

# Reload the training split from disk (header-less TSV).
train_df = pd.read_table("train.feature_debug.txt",
                         header=None,
                         sep="\t",
                         encoding="UTF-8")

# Split features from the label using train_df's own layout: every column but
# the last is a feature, the last column is the label. (The column count was
# previously taken from the unrelated in-memory `df`.)
X_train = train_df.iloc[:, :-1]
Y_train = train_df.iloc[:, -1]
print(X_train)
print(Y_train)

# max_iter is raised to 1000 because the solver did not converge with the default.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, Y_train)  # learn the weights

# Serialise the trained model to disk; the context manager guarantees the file
# handle is flushed and closed.
filename = "my_lr_debug.model"
with open(filename, "wb") as model_file:
    pickle.dump(lr, model_file)

# Load the model back from the file just written.
# NOTE: pickle can execute arbitrary code on load - only load trusted files.
with open("my_lr_debug.model", "rb") as model_file:
    lr = pickle.load(model_file)

# Reload the training and test splits from their feature files.
X_train, Y_train = load_data("train.feature_debug.txt")
X_test, Y_test = load_data("test.feature_debug.txt")

# Predict labels for both splits with the loaded model.
pred_train = lr.predict(X_train)
pred_test = lr.predict(X_test)

# Print the accuracy on the training data first, then on the test data.
for gold, predicted in ((Y_train, pred_train), (Y_test, pred_test)):
    print(accuracy_score(gold, predicted))
