In [1]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('D:/学习/集思项目/final project/train.csv')
data.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [3]:
data = data.head(10000)

In [4]:
data['text'] = data['TITLE'] + data['ABSTRACT']
X = data[['text']]
Y = data[['Computer Science','Physics','Mathematics','Statistics','Quantitative Biology','Quantitative Finance']]

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

X['text'] = X['text'].map(lambda com : clean_text(com))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# 转换数据格式
def DataFrame2List(X):
    X1 = np.array(X)
    X2 = X.values.flatten()
    X3 = X2.tolist()
    return X3

X = DataFrame2List(X)

In [7]:
def remove_stopwords(text,stop_words):
    text_new = " ".join([i for i in text if i not in stop_words])
    return text_new

In [8]:
# 去除停用词，标点等
from nltk.corpus import stopwords
remove = str.maketrans('','',string.punctuation)
X_without_punctuation = [text.translate(remove) for text in X]
stop_words = stopwords.words('english')
X_remove_stopwords = [remove_stopwords(text.split(), stop_words) for text in X_without_punctuation]

In [9]:
# 词形还原
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [10]:
def Lemmatization(sentence):
    
    tokens = word_tokenize(sentence) 
    tagged_sent = pos_tag(tokens)     
 
    wnl = WordNetLemmatizer()
    lemmas_sent = []
    for tag in tagged_sent:
        wordnet_pos = get_wordnet_pos(tag[1]) or wordnet.NOUN
        word = wnl.lemmatize(tag[0], pos=wordnet_pos)
        if len(word) == 1:
            continue
        else:
            lemmas_sent.append(word)
        
    return lemmas_sent

In [11]:
X_tokens = [Lemmatization(text) for text in X_remove_stopwords]

In [12]:
from collections import Counter,defaultdict
import math

def get_tf(corpus):
    return [Counter(doc) for doc in corpus]

def get_idf(tf_dict):
    idf = defaultdict(int)
    for doc in tf_dict:
        for word in doc:
            idf[word] += 1
    for word in idf:
        idf[word] = math.log(10000/(idf[word]+1))#idf的公式
    return idf
 
def get_tfidf(doc_id,corpus):
    
    tf = get_tf(corpus)
    idf = get_idf(tf)
    id_tf= tf[doc_id]

    for word in idf:
        idf[word] = idf[word] * id_tf[word] / len(corpus[doc_id])
        
    return idf

In [13]:
def Vectorization(tf_idf):

    tfidf = np.zeros(len(tf_idf))
    i = 0
    for word in tf_idf:
        tfidf[i] = tf_idf[word]
        i = i + 1
    
    return tfidf

In [14]:
tf = get_tf(X_tokens)
X_vector = []

for i in range(len(X_tokens)):
    id_tf = tf[i]
    idf = get_idf(tf)
    for word in idf:
        idf[word] = idf[word] * id_tf[word]
    X_vector.append(np.array(list(idf.values())))

In [15]:
import csv 
with open('d:/学习/集思项目/final project/tfidf2.csv', 'w', newline='') as csvfile:
    writer  = csv.writer(csvfile)
    for row in X_vector:
        writer.writerow(row)