In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocessing(text):
    """Normalize a headline string before vectorization.

    Applies three steps in order:
      1. every character in ``string.punctuation`` becomes a single space,
      2. the text is lower-cased,
      3. each run of ASCII digits is collapsed to the single character ``0``.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string. Whitespace is not collapsed, so a replaced
        punctuation mark next to an existing space leaves two spaces.
    """
    # One-to-one mapping: punctuation character -> space.
    punct_to_space = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    lowered = text.translate(punct_to_space).lower()
    # Collapse every digit run so that all numbers share one token.
    return re.sub('[0-9]+', '0', lowered)

# Load the tab-separated corpus; the file has no header row, so column
# names are supplied explicitly.
# NOTE(review): pandas' default quote handling is in effect here. Confirm
# the file has no unbalanced double quotes in TITLE; verify row counts.
column_names = ['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP']
df = pd.read_csv('newsCorpora.csv', header=None, sep='\t', names=column_names)

# Keep only articles from the selected publishers, and only the headline
# and its category label.
target_publishers = ['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']
is_target_publisher = df['PUBLISHER'].isin(target_publishers)
df = df.loc[is_target_publisher, ['TITLE', 'CATEGORY']]

# Stratified 80/10/10 split: first carve off 20%, then halve that part
# into validation and test.
train, valid_test = train_test_split(
    df,
    test_size=0.2,
    shuffle=True,
    random_state=123,
    stratify=df['CATEGORY'],
)
valid, test = train_test_split(
    valid_test,
    test_size=0.5,
    shuffle=True,
    random_state=123,
    stratify=valid_test['CATEGORY'],
)

# To inspect the class balance of a split, print
# train['CATEGORY'].value_counts() (likewise for valid and test).

# Stack the splits back together in train -> valid -> test order with a
# fresh 0..n-1 index, so the splits can be recovered later by position.
df = pd.concat([train, valid, test], axis=0).reset_index(drop=True)

# Normalize every headline.
df['TITLE'] = df['TITLE'].map(preprocessing)

# Re-slice by position: the first len(train) + len(valid) rows are the
# train+valid portion, the remainder is the test portion.
n_train_valid = len(train) + len(valid)
train_valid = df.iloc[:n_train_valid]
test = df.iloc[n_train_valid:]

# TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
# min_df=0.01 is a float, so it is a document-frequency proportion: terms
# appearing in fewer than 1% of the fitted documents are dropped.
vec_tfidf = TfidfVectorizer(min_df=0.01, ngram_range=(1, 2))

# Fit on train+valid only so that no information from the test set is
# used; the test set is merely transformed with the fitted vocabulary.
# Note that validation headlines do contribute to the vocabulary and IDF.
X_train_valid = vec_tfidf.fit_transform(train_valid['TITLE'])
X_test = vec_tfidf.transform(test['TITLE'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; prefer get_feature_names_out() and fall back only on old releases.
if hasattr(vec_tfidf, 'get_feature_names_out'):
    feature_names = vec_tfidf.get_feature_names_out()
else:
    feature_names = vec_tfidf.get_feature_names()

# Convert the sparse matrices to dense DataFrames with one column per term.
X_train_valid = pd.DataFrame(X_train_valid.toarray(), columns=feature_names)
X_test = pd.DataFrame(X_test.toarray(), columns=feature_names)

# Rows were stacked train-first, so the first len(train) rows are the
# training features and the rest are the validation features.
X_train = X_train_valid[:len(train)]
X_valid = X_train_valid[len(train):]

# Save the feature matrices as tab-separated text without index or header.
X_train.to_csv('./X_train.txt', sep='\t', index=False, header=False)
X_valid.to_csv('./X_valid.txt', sep='\t', index=False, header=False)
X_test.to_csv('./X_test.txt', sep='\t', index=False, header=False)

print(X_train.head())

   about    after  all       and  are        as   at  bank   be  before  ...  \
0    0.0  0.00000  0.0  0.000000  0.0  0.000000  0.0   0.0  0.0     0.0  ...   
1    0.0  0.31646  0.0  0.000000  0.0  0.000000  0.0   0.0  0.0     0.0  ...   
2    0.0  0.00000  0.0  0.000000  0.0  0.000000  0.0   0.0  0.0     0.0  ...   
3    0.0  0.00000  0.0  0.000000  0.0  0.000000  0.0   0.0  0.0     0.0  ...   
4    0.0  0.00000  0.0  0.463099  0.0  0.439275  0.0   0.0  0.0     0.0  ...   

   wall  wall st  was  week  west  who  will  with  year  you  
0   0.0      0.0  0.0   0.0   0.0  0.0   0.0   0.0   0.0  0.0  
1   0.0      0.0  0.0   0.0   0.0  0.0   0.0   0.0   0.0  0.0  
2   0.0      0.0  0.0   0.0   0.0  0.0   0.0   0.0   0.0  0.0  
3   0.0      0.0  0.0   0.0   0.0  0.0   0.0   0.0   0.0  0.0  
4   0.0      0.0  0.0   0.0   0.0  0.0   0.0   0.0   0.0  0.0  

[5 rows x 111 columns]
