# Vectorizing the data and saving the new feature-extracted dataset


In [1]:
# system tools
import os
import sys

# data munging tools
import pandas as pd

# Machine learning stuff
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer



In [2]:
# Locate the CSV relative to this notebook: two levels up, inside cds-language/data
data_dir = os.path.join("..", "..", "cds-language", "data")
filepath = os.path.join(data_dir, "fake_or_real_news.csv")

# Load the news articles into a DataFrame
news_data = pd.read_csv(filepath)

In [3]:
# Bare expression: renders the loaded DataFrame as this cell's output for a quick visual check
news_data

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL


In [4]:
# Separate the model input (article body) from the target (the "label" column)
X, y = news_data["text"], news_data["label"]

In [6]:
# Split texts and labels into training and test partitions.
# test_size=0.2 holds out 20% of the rows; the fixed random_state makes the
# shuffle repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# make vectorizer 

vectorizer = TfidfVectorizer(ngram_range = (1,2),     # unigrams and bigrams (1 word and 2 word units)
                             lowercase =  True,       # fold case so "The" and "the" map to the same feature
                             max_df = 0.95,           # drop terms found in more than 95% of documents
                             min_df = 0.05,           # drop terms found in fewer than 5% of documents
                             max_features = 500)      # keep only top 500 features

# Fit on the training texts BEFORE persisting. A freshly constructed
# TfidfVectorizer has no vocabulary or idf weights, so a copy saved at this
# point could not transform() anything once loaded elsewhere.
# (Calling fit_transform on the same training data afterwards simply
# relearns the identical vocabulary.)
vectorizer.fit(X_train)

#save the fitted vectorizer in the models folder
from joblib import dump, load
os.makedirs("../models", exist_ok=True)   # dump() does not create missing folders
dump(vectorizer,"../models/tfidf_vectorizer.joblib")

['../models/tfidf_vectorizer.joblib']

In [8]:
# Vectorize the texts.
# The vocabulary and idf weights are learned from the training split only and
# applied to it in the same pass ...
X_train_feats = vectorizer.fit_transform(X_train)

# ... while the test split is only transformed with what was learned above,
# so nothing from the test texts influences the features.
X_test_feats = vectorizer.transform(X_test)

# The terms (unigrams/bigrams) behind each column of the feature matrices
feature_names = vectorizer.get_feature_names_out()

In [9]:
# Bare expression: show the repr of the training feature matrix (shape, dtype, storage format)
X_train_feats

<5068x500 sparse matrix of type '<class 'numpy.float64'>'
	with 720821 stored elements in Compressed Sparse Row format>

In [10]:
# Check which container class holds the features, to choose a matching save routine
type(X_train_feats)

scipy.sparse._csr.csr_matrix

In [12]:
import scipy as sp
# Import the submodule explicitly: `import scipy` alone does not guarantee
# that `sp.sparse` is available on every SciPy version; it only works when
# something else (e.g. scikit-learn) has already loaded scipy.sparse.
import scipy.sparse

# save the SciPy sparse matrices in the feature_extracted_object folder
os.makedirs('../feature_extracted_object', exist_ok=True)   # save_npz needs the folder to exist
sp.sparse.save_npz('../feature_extracted_object/X_train_feats.npz', X_train_feats)
sp.sparse.save_npz('../feature_extracted_object/X_test_feats.npz', X_test_feats)
