# Tokenize Text Columns Into Sentences

Required libraries
<br>
[pip install spacy](https://pypi.org/project/spacy/)

In [None]:
# Import Dependencies and setup
import os

import pandas as pd
import spacy
from spacy.lang.en import English

In [None]:
# Load the CSV produced by Instagrapy_split_text.ipynb, decoding it with the
# ISO 8859-1 codec.
df = pd.read_csv(
    "../../resources/ig_datascrape_jc_2021-08-25.csv",
    encoding="ISO 8859-1",
)

# Preview the first two rows as a quick sanity check of the load.
df.head(n=2)

In [None]:
# Row count of the loaded DataFrame (first element of its shape)
number_of_rows = df.shape[0]

number_of_rows

In [None]:
# Drop the "Unnamed: 0.1" column (presumably a stale index column written by
# an earlier export -- confirm). errors="ignore" keeps the drop from raising
# KeyError when this cell is re-run and the column is already gone.
df = df.drop(columns=["Unnamed: 0.1"], errors="ignore")
df = df.reset_index(drop=True)

# Convert epoch seconds to datetime
df["timestamp"] = pd.to_datetime(df["timestamp"], unit="s")

# Keep the first row's "text" value in ig_text as a sample for the
# tokenizer tests further down
ig_text = df.loc[0, "text"]
print(ig_text)

In [None]:
# Preview the first five rows after the column drop and timestamp conversion
df.head(n=5)

## Tokenization

Resources to better understand text preprocessing
<br>
[Tokenize Text Columns Into Sentences in Pandas](https://towardsdatascience.com/tokenize-text-columns-into-sentences-in-pandas-2c08bc1ca790)
<br>
Note that spaCy v3 replaces the v2 pattern `nlp.add_pipe(nlp.create_pipe("sentencizer"))` with `nlp.add_pipe("sentencizer")`.

In [None]:
# Required library and spaCy model. Uncomment and run the two commands below
# once if they are not yet installed in this kernel's environment.

# !pip install spacy
# !python -m spacy download en_core_web_sm

In [None]:
# Test: split the sample text into sentences with the pretrained
# "en_core_web_sm" pipeline (`spacy` is imported in the top import cell).
nlp = spacy.load("en_core_web_sm")

# Doc.sents yields one span per sentence; .text gives it as a plain string.
# str() guards against a non-string sample value, which nlp() cannot process.
[sentence.text for sentence in nlp(str(ig_text)).sents]

In [None]:
# Test: build an English pipeline with no trained model loaded and add the
# "sentencizer" component to it (`English` is imported in the top import cell).
# NOTE: this rebinds `nlp`, replacing the pipeline loaded in the previous cell.
nlp = English()
sentencizer = nlp.add_pipe("sentencizer")

In [None]:
# Sentences of the sample text as found by the sentencizer pipeline that the
# previous cell bound to `nlp`
[span.text for span in nlp(ig_text).sents]

# END Test

In [None]:
# Tokenize every entry of the "text" column into a list of sentences.
#
# Some entries are ints or floats rather than strings, which made the
# pipeline fail with "object of type 'float' has no len()", so values are
# converted with str() before being tokenized. Two extra guards:
#   * missing values (NaN/None) become an empty list instead of the bogus
#     one-sentence list ["nan"];
#   * values that are already lists are returned unchanged, so re-running
#     this cell does not re-tokenize the repr of an earlier result.

# Reload the pretrained pipeline: the test cells above rebind `nlp` to a
# model-less English pipeline.
nlp = spacy.load("en_core_web_sm")


def split_into_sentences(value):
    """Return the sentences of ``value`` as a list of strings.

    Parameters
    ----------
    value : object
        One cell of the "text" column: a string, a number, a missing value,
        or a list produced by an earlier run of this cell.

    Returns
    -------
    list
        ``value`` itself if it is already a list, ``[]`` if it is missing,
        otherwise the sentence strings spaCy finds in ``str(value)``.
    """
    # Check for list first: pd.isna() on a list returns an array, not a bool.
    if isinstance(value, list):
        return value
    if pd.isna(value):
        return []
    return [sent.text for sent in nlp(str(value)).sents]


df["text"] = df["text"].apply(split_into_sentences)
