W tym notebooku chcemy mieć kod, który wylicza TF-IDF z kolumny NARRATIVE, łączy wynik z pozostałymi danymi i generuje gotowe dane do modelu.

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import nltk
import spacy
from matplotlib import pyplot as plt
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import PCA
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import norm, uniform
from pywsd.utils import lemmatize_sentence

# Show every column when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Fetch the NLTK resources. quiet=True keeps the downloader log (which prints
# the local nltk_data directory) out of the saved notebook output.
for nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource, quiet=True)


Warming up PyWSD (takes ~10 secs)... took 2.6618850231170654 secs.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ivansmaliakou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ivansmaliakou/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Load the output of the previous (one-hot) pipeline stage.
processed_csv_path = '../data/4_one_hot/processed_data.csv'
df = pd.read_csv(processed_csv_path)

In [3]:
# Keep only the object-dtype columns of the loaded frame.
object_column_mask = df.dtypes == 'object'
text_data = df.loc[:, object_column_mask]

In [4]:
# Number of rows whose NARRATIVE value is missing.
narrative_missing_mask = df['NARRATIVE'].isna()
narrative_missing_mask.sum()

43

In [5]:
# Replace missing narratives with an empty string so every row holds text.
narrative_filled = df['NARRATIVE'].fillna('')
df['NARRATIVE'] = narrative_filled

In [6]:
# Lemmatize every narrative with spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')

# Lowercase each narrative up front; str.lower() replaces the original
# character-by-character join.
lowercased_narratives = [narrative.lower() for narrative in df['NARRATIVE']]

# nlp.pipe processes the texts in batches, which is considerably faster than
# calling nlp() once per document. It yields Doc objects in input order, so
# narrative_sentences stays aligned with the rows of df.
narrative_sentences = [
    ' '.join(token.lemma_ for token in doc)
    for doc in tqdm(
        nlp.pipe(lowercased_narratives, batch_size=64),
        total=len(lowercased_narratives),
    )
]

100%|██████████| 8134/8134 [03:45<00:00, 36.09it/s]


In [7]:
# Word-level TF-IDF over the lemmatized narratives, dropping English stop words.
tfidfvectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
)

# Sparse document-term matrix and the vocabulary terms for its columns.
tfidf_wm = tfidfvectorizer.fit_transform(narrative_sentences)
tfidf_tokens = tfidfvectorizer.get_feature_names_out()

# Dense frame: one row per narrative, one column per vocabulary term.
dense_tfidf = tfidf_wm.toarray()
df_tfidfvect = pd.DataFrame(dense_tfidf, columns=tfidf_tokens)

In [8]:
# Total TF-IDF weight per token across all narratives; show the 20 heaviest.
token_weight_totals = df_tfidfvect.sum(axis=0)
token_weight_totals.sort_values(ascending=False).head(20)

valve            393.468904
leak             317.642908
line             306.344810
gas              302.396936
pipeline         287.248559
release          278.986152
pump             253.508477
station          232.002137
tank             229.311965
pipe             200.652560
approximately    199.510223
cause            189.457526
report           189.345526
oil              188.972007
service          180.709613
repair           167.013101
product          160.522691
crude            158.419888
pressure         158.360736
seal             158.013205
dtype: float64

In [9]:
# Reduce the TF-IDF frame to a fixed number of principal components.
N_TFIDF_COMPONENTS = 500
# The randomized SVD solver is stochastic; a fixed seed makes the projected
# features (and the CSV written from them) reproducible across runs.
PCA_RANDOM_STATE = 42

pca = PCA(
    n_components=N_TFIDF_COMPONENTS,
    svd_solver='randomized',
    random_state=PCA_RANDOM_STATE,
)
reduced_tfidf = pca.fit_transform(df_tfidfvect)

In [10]:
# Sanity check: (number of narratives, number of principal components).
np.shape(reduced_tfidf)

(8134, 500)

In [11]:
# Name one column per principal component. The count comes from the array
# itself, so it cannot drift from the PCA configuration.
n_components_out = reduced_tfidf.shape[1]
column_names = [f'narrative_tfidf-PC-{i}' for i in range(n_components_out)]
tfidf = pd.DataFrame(reduced_tfidf, columns=column_names)

# Create the output directory if needed, since to_csv does not create missing
# parent directories.
tfidf_output_path = Path('../data/5_tf_idf/tf-idf.csv')
tfidf_output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): written without an index or key column, so rows can only be
# matched back to the source frame by position — confirm the consumer relies on that.
tfidf.to_csv(tfidf_output_path, index=False)