#  Text Data Preprocessing

In [1]:
# Import necessary libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import string
# from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the raw text corpus from the CSV file into a DataFrame

df = pd.read_csv(filepath_or_buffer="../DATA/Text_data.csv")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,text
0,"The AI revolution is, here."
1,seaborn and matploblib are visualization tools.
2,Pandas is a powerful% data analysis library.
3,Data analysis is a crucial part of machine lea...
4,Python is a popular programming language for d...


In [3]:
# Count the rows whose 'text' value repeats an earlier row

df['text'].duplicated(keep='first').sum()

0

In [4]:
# Alternative check: a single flag that is True if any 'text' value is repeated
df.duplicated(subset='text', keep='first').any()

False

In [5]:
# Build a separate working frame (the raw `df` is left untouched) whose
# text column has been converted to lower case
df_copy = df.assign(text=df['text'].str.lower())
df_copy.head()

Unnamed: 0,text
0,"the ai revolution is, here."
1,seaborn and matploblib are visualization tools.
2,pandas is a powerful% data analysis library.
3,data analysis is a crucial part of machine lea...
4,python is a popular programming language for d...


In [6]:
# Strip every character listed in string.punctuation from the text column.
# Characters are deleted outright (not replaced by a space).
punctuation_table = str.maketrans('', '', string.punctuation)
df_copy['text'] = df_copy['text'].str.translate(punctuation_table)
df_copy.head()

Unnamed: 0,text
0,the ai revolution is here
1,seaborn and matploblib are visualization tools
2,pandas is a powerful data analysis library
3,data analysis is a crucial part of machine lea...
4,python is a popular programming language for d...


In [7]:
# Add a column holding the character count of each text entry
text_lengths = df_copy['text'].str.len()
df_copy['text length'] = text_lengths
df_copy.head()

Unnamed: 0,text,text length
0,the ai revolution is here,25
1,seaborn and matploblib are visualization tools,46
2,pandas is a powerful data analysis library,42
3,data analysis is a crucial part of machine lea...,51
4,python is a popular programming language for d...,57


In [8]:
# Longest text: rank rows by length (largest first) and keep the top one
longest_first = df_copy.sort_values(by='text length', ascending=False)
longest_first.head(n=1)

Unnamed: 0,text,text length
8,big data is becoming increasingly important in...,59


In [9]:
# Alternative: same ranking, taking the first row by position
by_length_desc = df_copy.sort_values(by='text length', ascending=False)
by_length_desc.iloc[0:1]

Unnamed: 0,text,text length
8,big data is becoming increasingly important in...,59


In [10]:
# Remove stopwords
# NOTE: this step is currently disabled. It depends on nltk's `stopwords`
# corpus, whose import is also commented out in the imports cell, so the
# text column still contains stopwords after this cell.

# Access English stopwords
# stop_words = set(stopwords.words('english'))

# df_copy['text'] = df_copy['text'].apply(lambda x: ' '.join([word for word in x.split()
#                                                             if word not in stop_words]))
# df_copy.head()

In [11]:
# Build a bag-of-words (token count) representation of the text column.

# Instantiate the vectorizer with scikit-learn's default settings
cv = CountVectorizer()

# Learn the vocabulary from the text column and count each token per row;
# the result is a sparse document-term matrix
words_matrix = cv.fit_transform(df_copy['text'])

# Convert to a dense DataFrame with one column per vocabulary term.
# Reuse df_copy's index so each count row stays aligned with its source row
# even if df_copy's index is not the default 0..n-1 (e.g. after dropping rows).
# NOTE: .toarray() densifies the matrix; fine for a small corpus, but
# memory-hungry when the vocabulary is large.
words_df = pd.DataFrame(
    words_matrix.toarray(),
    index=df_copy.index,
    columns=cv.get_feature_names_out(),
)
words_df.head()


Unnamed: 0,ai,analysis,and,are,becoming,big,changing,crucial,data,for,...,science,seaborn,statistics,subfield,the,todays,tools,transforming,visualization,world
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,0,1,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,1,...,1,0,0,0,0,0,0,0,0,0
