In [1]:
def make_feature_file(fname):
    """Write the repeated title stems of a tab-separated file to a feature file.

    Every non-blank line of ``fname`` is split on tabs and its second field is
    taken as the title. The title is lower-cased, runs of ASCII digits are
    removed, hyphens are turned into spaces, and the result is split on
    whitespace. Tokens found in NLTK's English stop-word list are dropped, the
    rest are stemmed with the Snowball English stemmer, and stems for which
    ``str.islower()`` is false (e.g. tokens made only of punctuation) are
    discarded.

    Stems seen more than once across the whole file are written one per line,
    most frequent first (ties keep first-seen order), to
    ``<fname minus a trailing '.txt'>.feature.txt``.

    Args:
        fname: Path (``str``) of the input file.

    Returns:
        None. The result is the file written next to the input.

    Raises:
        IndexError: If a non-blank line contains no tab character.
        OSError: If the input cannot be read or the output cannot be written.
    """
    import re
    from collections import Counter

    import snowballstemmer
    from nltk.corpus import stopwords

    # Loop-invariant resources: build them once instead of once per line.
    # A frozenset gives the same membership answers as the list, in O(1).
    stop_words = frozenset(stopwords.words('english'))
    stemmer = snowballstemmer.stemmer('english')
    digit_run = re.compile(r'[0-9]+')

    stem_counts = Counter()
    with open(fname) as file:
        for line in file:
            line = line.rstrip('\n')

            # A blank line (e.g. a trailing empty line) carries no title;
            # skip it rather than failing on the field lookup below.
            if not line.strip():
                continue

            title = line.split('\t')[1]

            # Normalise case, drop digits, and break hyphenated words apart.
            title = digit_run.sub('', title.lower()).replace('-', ' ')

            # Stop words are removed before stemming, so they are matched in
            # their surface form.
            stems = (
                stemmer.stemWord(word)
                for word in title.split()
                if word not in stop_words
            )

            # islower() is false for tokens without any cased character, which
            # is what filters out pure-symbol tokens.
            stem_counts.update(stem for stem in stems if stem.islower())

    # Keep only stems that occur at least twice, ordered by frequency.
    features = [stem for stem, count in stem_counts.most_common() if count > 1]

    # Strip only a trailing '.txt'; other occurrences (for example in a
    # directory name) must be left intact.
    base_name = fname[:-len('.txt')] if fname.endswith('.txt') else fname
    fname2 = '{0}.feature.txt'.format(base_name)
    with open(fname2, 'w') as file2:
        file2.writelines('{0}\n'.format(feature) for feature in features)

In [2]:
# Generate a .feature.txt file for each data split, in train/valid/test order.
for _split_fname in ('train.txt', 'valid.txt', 'test.txt'):
    make_feature_file(_split_fname)