In [1]:
!pip install nltk spacy pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m50.3 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [10]:
import pandas as pd
df = pd.read_csv('/content/IMDB_Dataset.csv')[:200]  # Use 200 reviews for speed
print(df.head())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [11]:
from nltk.tokenize import word_tokenize, sent_tokenize
review = df['review'].iloc[0]
words = word_tokenize(review)
sentences = sent_tokenize(review)
print(f"First 10 words: {words[:10]}")
print(f"First 2 sentences: {sentences[:2]}")

First 10 words: ['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching']
First 2 sentences: ["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked.", 'They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO.']


In [12]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(review)
words = [token.text for token in doc]
sentences = [sent.text for sent in doc.sents]
print(f"First 10 words: {words[:10]}")
print(f"First 2 sentences: {sentences[:2]}")

First 10 words: ['One', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching']
First 2 sentences: ["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked.", 'They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO.']


In [13]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [w for w in word_tokenize(review.lower()) if w not in stop_words]
print(f"First 10 filtered words: {filtered_words[:10]}")

First 10 filtered words: ['one', 'reviewers', 'mentioned', 'watching', '1', 'oz', 'episode', "'ll", 'hooked', '.']


In [14]:
doc = nlp(review.lower())
filtered_words = [token.text for token in doc if token.text not in nlp.Defaults.stop_words]
print(f"First 10 filtered words: {filtered_words[:10]}")

First 10 filtered words: ['reviewers', 'mentioned', 'watching', '1', 'oz', 'episode', 'hooked', '.', 'right', ',']


In [15]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
stemmed_words = [ps.stem(w) for w in filtered_words]
print(f"First 10 stemmed words: {stemmed_words[:10]}")

First 10 stemmed words: ['review', 'mention', 'watch', '1', 'oz', 'episod', 'hook', '.', 'right', ',']


In [16]:
doc = nlp(review.lower())
lemmas = [token.lemma_ for token in doc if token.text not in nlp.Defaults.stop_words]
print(f"First 10 lemmas: {lemmas[:10]}")

First 10 lemmas: ['reviewer', 'mention', 'watch', '1', 'oz', 'episode', 'hook', '.', 'right', ',']


In [17]:
import re
cleaned_review = re.sub(r'http\S+|[^\x00-\x7F]+|[.,!?]', '', review.lower())
print(f"Cleaned review: {cleaned_review[:100]}")

Cleaned review: one of the other reviewers has mentioned that after watching just 1 oz episode you'll be hooked they


In [18]:
import spacy
import pandas as pd
import re

# Load SpaCy
nlp = spacy.load("en_core_web_sm")

# Load data
df = pd.read_csv('/content/IMDB_Dataset.csv')[:200]  # Adjust path

# Preprocessing pipeline
cleaned_data = []
for review in df['review']:
    # Clean
    cleaned = re.sub(r'http\S+|[^\x00-\x7F]+|[.,!?]', '', review.lower())
    # Process with SpaCy
    doc = nlp(cleaned)
    # Tokens (no stopwords)
    tokens = [token.text for token in doc if token.text not in nlp.Defaults.stop_words and token.is_alpha]
    # Lemmas
    lemmas = [token.lemma_ for token in doc if token.text not in nlp.Defaults.stop_words and token.is_alpha]
    cleaned_data.append([review, cleaned, tokens, lemmas])

# Save to CSV
output_df = pd.DataFrame(cleaned_data, columns=['original_text', 'cleaned_text', 'tokens', 'lemmas'])
output_df.to_csv('cleaned_reviews.csv', index=False)
print(output_df.head())

                                       original_text  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        cleaned_text  \
0  one of the other reviewers has mentioned that ...   
1  a wonderful little production <br /><br />the ...   
2  i thought this was a wonderful way to spend ti...   
3  basically there's a family where a little boy ...   
4  petter mattei's "love in the time of money" is...   

                                              tokens  \
0  [reviewers, mentioned, watching, oz, episode, ...   
1  [wonderful, little, production, br, filming, t...   
2  [thought, wonderful, way, spend, time, hot, su...   
3  [basically, family, little, boy, jake, thinks,...   
4  [petter, mattei, love, time, money, visuall