In [1]:
#### import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# feature_extraction is used for converting text into numerical features 
from sklearn import feature_extraction
# linear_model contains various linear models for regression and classification
from sklearn import linear_model
# model_selection provides train_test_split() and other data-splitting utilities
from sklearn import model_selection
# preprocessing contains utilities for scaling, normalization, etc.
from sklearn import preprocessing

In [2]:
# Read the competition's train and test CSV files into DataFrames
# (train first, then test).
train_df, test_df = map(
    pd.read_csv,
    (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
    ),
)

In [3]:
# Inspect the basic structure of the train dataset: column summary, shape,
# the first rows, and the raw text of the first tweets.
print('*' * 100)
print('The concise summary of the train dataset:')
# info() writes its report directly to stdout, so it is called on its own
# line rather than embedded in a print().
train_df.info()

print('*' * 100)
print(f'The shape the train dataset: {train_df.shape}')

print('*' * 100)
# Start the frame on its own line: when it follows the label on the same
# line, the column header row is shifted right and no longer lines up with
# the data rows beneath it.
print(f'The first 5 samples of the dataset: \n{train_df.head()}')

print('*' * 100)
print('The text in first 5 samples of the dataset:')
# Key each tweet by its position in the frame. head() simply returns fewer
# rows when the frame is shorter than five, where iloc[i] would raise
# IndexError; it also avoids building a full row Series per lookup.
print(dict(enumerate(train_df['text'].head())))

****************************************************************************************************
The concise summary of the train dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
****************************************************************************************************
The shape the train dataset: (7613, 5)
****************************************************************************************************
The first 5 samples of the dataset:    id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1 

In [4]:
# Build a bag-of-words representation of the training tweets.
vectorizer = feature_extraction.text.CountVectorizer()

# Learn the vocabulary and produce the word-count matrix in a single pass
# over the text (equivalent to fit() followed by transform() on the same
# data, without tokenising the tweets twice).
X_train = vectorizer.fit_transform(train_df['text'])

# print a slice of the vocabulary the vectorizer built from the tweets
print('*' * 100)
print(f'The vocabularies of tweets: \n{vectorizer.get_feature_names_out()[450:500]}')

# X_train is a sparse matrix: only the non-zero counts are stored to save
# memory. Read the shape from it directly; calling toarray() would allocate
# the full dense matrix just to report the same (rows, columns) tuple.
print('*' * 100)
print(f'The shape of the numerical vectors of tweets: \n{X_train.shape}')

****************************************************************************************************
The vocabularies of tweets: 
['2fenu1syu6' '2fggzqn1v4' '2fibe2haxc' '2for1lapdances' '2fs649qdwx'
 '2gljhvead9' '2h0dpmv2ef' '2hocep41kh' '2hours' '2hv2y2m2oz' '2i4eoggo5j'
 '2iafpmqjep' '2ii3brc7nx' '2jbibeib9g' '2jgvhw7yzs' '2jhtlwuey0'
 '2jr3yo55dr' '2jxkmkpalp' '2k13' '2k15' '2kdq56xtws' '2lbtshxi3c'
 '2leezy' '2lgtzkwmqw' '2liwkjybe9' '2lqyxzq5dn' '2m1gneaifl' '2minutemix'
 '2mnqc73hfk' '2mwc9ywjzy' '2nd' '2nip3d15dx' '2nndbgwyei' '2ns5tfnxpa'
 '2o7eva1coe' '2okscwyohc' '2oqsgzqlbz' '2oroyunym2' '2pack' '2pcs'
 '2pimg9bice' '2pm' '2ppzgpxybi' '2q3fuerey5' '2racaivffq' '2rtq9qmgpb'
 '2sdmichb2z' '2sgdofsmrq' '2slow2report' '2snyghaivs']
****************************************************************************************************
The shape of the numerical vectors of tweets: 
(7613, 21637)


In [5]:
# Encode the test tweets with the same vectorizer object; transform() only
# maps text to counts and does not re-fit the vocabulary.
X_test = vectorizer.transform(test_df.loc[:, 'text'])