# Author
Muhammad Junaid Raza 2409917

## Import Libraries

In [35]:
import os
import random

import pandas as pd
import wget
from datasets import Dataset, DatasetDict

### Downloading the Dataset

In [74]:
# Code Reference:
### https://stackoverflow.com/questions/57748687/downloading-files-in-jupyter-wget-on-windows

In [14]:
# List of URLs to download (positive and negative IMDB reviews)
urls = [
    "http://dl.turkunlp.org/TKO_7095_2023/imdb-positives.txt",
    "http://dl.turkunlp.org/TKO_7095_2023/imdb-negatives.txt",
]

# Fetch each file only when it is not already in the working directory.
# Why: wget.download() does not overwrite an existing file; it stores the new
# copy under a numbered name, so re-running this cell would pile up duplicates
# while the cells below keep reading the first copy.
for url in urls:
    filename = url.split("/")[-1]  # Local name is the last path segment of the URL
    if os.path.exists(filename):
        print(f"Already present, skipping {filename}")
        continue
    wget.download(url)  # Download the file from the URL
    print(f"Downloaded {filename}")  # Print confirmation message with file name

Downloaded imdb-positives.txt
Downloaded imdb-negatives.txt


### Cleaning and loading the data from downloaded files

In [55]:
# Function to clean text by replacing HTML line breaks with actual newlines
def clean_text(text):
    """Replace HTML ``<br />`` line-break tags in ``text`` with newlines.

    Args:
        text: A review string that may contain ``<br />`` tags.

    Returns:
        The string with each ``<br /><br />`` pair turned into one newline,
        and every ``<br />`` tag left over after that turned into a newline
        as well, so no such tag remains in the result.
    """
    # Pairs first, so the usual paragraph break stays a single newline;
    # the second pass catches lone tags the pair replacement cannot match.
    return text.replace("<br /><br />", "\n").replace("<br />", "\n")

def _read_reviews(path):
    """Return the cleaned, whitespace-stripped lines of the UTF-8 file at ``path``."""
    with open(path, "r", encoding="utf-8") as handle:
        # One review per line: strip surrounding whitespace, then clean it
        return [clean_text(raw_line.strip()) for raw_line in handle]


# Load both review files through the same helper
positive_reviews = _read_reviews("imdb-positives.txt")
negative_reviews = _read_reviews("imdb-negatives.txt")

print(f"Positive reviews: {len(positive_reviews)}")
print(f"Negative reviews: {len(negative_reviews)}")

Positive reviews: 25000
Negative reviews: 25000


### Assigning Labels and Combining Data

In [62]:
# Creating labeled dataset: one {"text", "label"} record per review
positive_data = [{"text": text, "label": "positive"} for text in positive_reviews]
negative_data = [{"text": text, "label": "negative"} for text in negative_reviews]

# Merge and shuffle the dataset.
# A dedicated generator with a fixed seed makes the shuffled order the same on
# every run, so the slices taken from all_data afterwards are reproducible.
# It also leaves the global `random` state untouched.
SHUFFLE_SEED = 42
all_data = positive_data + negative_data
random.Random(SHUFFLE_SEED).shuffle(all_data)

print(f"Total Data: {len(all_data)}")

Total Data: 50000


### Splitting the Dataset (80% Train, 10% Validation, 10% Test)

In [64]:
# Split boundaries: the first 80% of the rows are used for training, the next
# 10% for validation, and everything after that becomes the test set.
total_size = len(all_data)
train_size, valid_size = int(0.8 * total_size), int(0.1 * total_size)
valid_end = train_size + valid_size  # index where validation stops and test starts

# Slice the list into three consecutive, non-overlapping pieces
train_data, valid_data, test_data = (
    all_data[:train_size],
    all_data[train_size:valid_end],
    all_data[valid_end:],
)

### Converting to DatasetDict Format

In [65]:
# Map each split name to its list of records, then wrap every list in a
# Dataset and collect them into a single DatasetDict.
split_rows = {
    "train": train_data,
    "validation": valid_data,
    "test": test_data,
}
dataset = DatasetDict(
    {split_name: Dataset.from_list(rows) for split_name, rows in split_rows.items()}
)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 40000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})


### Saving datasets as CSV files

In [68]:
# Build one DataFrame per split, in train / validation / test order
train_df, valid_df, test_df = (
    pd.DataFrame(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

# Write each frame to its CSV file without the index column
csv_targets = (
    (train_df, "train.csv"),
    (valid_df, "validation.csv"),
    (test_df, "test.csv"),
)
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)

print("Datasets saved.")

Datasets saved.
