# テキストのクリーニング

In [2]:

text_data = ["   Interrobang. By Aishwarya Henriette     ",
             "Parking And Going. By Karl Gautier",
             "    Today Is The night. By Jarek Prakash   "]

# ホワイトスペースがあるテキスト
strip_white=[string.strip() for string in text_data]

strip_white

['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash']

In [6]:
# ピリオドの除去
remove_periods=[string.replace('.','') for string in strip_white]

remove_periods

['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash']

In [10]:
# 大文字に変換
def captilizer(string:str)->str:
    return string.upper()

upper=[captilizer(string) for string in remove_periods]

upper

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [15]:
# 正規表現で変換
import re

def replace_letters_to_X(string : str)->str:
    return re.sub(r'[a-zA-Z]','X',string)
    
[replace_letters_to_X(string) for string in upper]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

# 句読点を取り除きたい

In [33]:
import unicodedata
import sys

text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

# 句読点をすべて網羅するdicをつくる
# keyには句読点に所属する文字のcodepoint,valueにはNoneが入っている
punc=dict.fromkeys(i for i in range(sys.maxunicode)
                                    if unicodedata.category(chr(i)).startswith('P') )

[string.translate(punc) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

# トークン化


## 英語の場合

In [52]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hiroaki/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [54]:
from nltk.tokenize import sent_tokenize
string = "The science of today is the technology of tomorrow. Tomorrow is today."
sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

## 日本語の場合

In [66]:
from janome.tokenizer import  Tokenizer

japanese='こんにちはごきげんようさようなら'
tokenizer=Tokenizer()
tokens=tokenizer.tokenize(japanese)

for token in tokens:
    print(token)

こんにちは	感動詞,*,*,*,*,*,こんにちは,コンニチハ,コンニチワ
ごきげんよう	感動詞,*,*,*,*,*,ごきげんよう,ゴキゲンヨウ,ゴキゲンヨー
さようなら	感動詞,*,*,*,*,*,さようなら,サヨウナラ,サヨーナラ


# Bag of Wordsによるベクトル化

In [95]:
# 特徴量ベクトル化
import  numpy as np
from sklearn.feature_extraction.text import CountVectorizer

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

count=CountVectorizer()

bag_of_words=count.fit_transform(text_data)

type(bag_of_words)

scipy.sparse.csr.csr_matrix

In [96]:
bag_of_words.toarray()

array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)

In [97]:
import pandas as pd
col=count.get_feature_names()

pd.DataFrame(bag_of_words.toarray(),columns=col)

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0,0,0,2,0,0,1,0
1,0,1,0,0,0,1,0,1
2,1,0,1,0,1,0,0,0


# wordに重みをつけるて特徴ベクトル化(tf-idf変換)

頻度が高い単語の寄与が少なくなる

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf=TfidfVectorizer()

feature_matrix=tfidf.fit_transform(text_data)

pd.DataFrame(feature_matrix.toarray(),columns=tfidf.vocabulary_)

Unnamed: 0,love,brazil,sweden,is,best,germany,beats,both
0,0.0,0.0,0.0,0.894427,0.0,0.0,0.447214,0.0
1,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735
2,0.57735,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0
