In [1]:
# Colab-only: mount Google Drive at /content/drive so the cells below can
# read and write files stored under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Load data

In [2]:
# Drive folder holding the corpus files. Despite the name, the notebook both
# reads its input from here and writes the transposed CSV back to this folder.
# The value must keep its trailing slash: file names are appended with `+`.
folder_input_path = '/content/drive/My Drive/Colab Notebooks/Ch3_Corpora/'

In [3]:
import pandas as pd

# Tab-separated corpus in long format: one (label, sentence) pair per line.
file_path = 'sentimental_paraphrase_1-200.src.tsv'

# The file has no header row, so the column names are supplied here.
# `header=None` is what pandas already assumes when `names` is given; it is
# spelled out only to make that explicit to the reader.
data = pd.read_csv(
    folder_input_path + file_path,
    sep='\t',
    header=None,
    names=['label', 'sentence'],
)

# Preview the first rows to confirm the two columns were parsed as expected
data.head()

Unnamed: 0,label,sentence
0,original,The weather is unpredictable today.
1,positive,The weather today holds exciting surprises.
2,negative,Today's weather is frustratingly inconsistent.
3,original,The food at this restaurant is decent.
4,positive,The cuisine at this restaurant is delightfully...


# Check for duplicates

In [4]:
# Find every row whose sentence text occurs more than once in the dataset.
# `subset="sentence"` restricts the comparison to the sentence column, so a
# sentence repeated under a different label (e.g. a "positive" paraphrase that
# is identical to its "original") is reported too; comparing whole rows would
# only catch repeats that also share the same label.
# `keep=False` marks all occurrences, not just the second and later ones.
duplicate_sentences = data[data.duplicated(subset="sentence", keep=False)]

# Display the duplicates, if any (an empty frame means every sentence is unique)
duplicate_sentences

Unnamed: 0,label,sentence
51,original,The meeting was somewhat productive.
255,original,The meeting was somewhat productive.


# Transpose data from rows to columns

In [5]:
# Reshape the long table (one sentence per row, cycling
# original -> positive -> negative) into a wide table with one row per triplet
# and one column per sentiment.
sentiment_columns = ["original", "positive", "negative"]
group_size = len(sentiment_columns)

# The reshape below is purely positional, so first confirm that the labels
# really follow the expected cycle. Without this check a single missing or
# misordered line would silently push every later sentence into the wrong column.
labels = data["label"].tolist()
misplaced = [
    position
    for position, label in enumerate(labels)
    if label != sentiment_columns[position % group_size]
]
if misplaced:
    first = misplaced[0]
    raise ValueError(
        f"Row {first} has label {labels[first]!r} but "
        f"{sentiment_columns[first % group_size]!r} was expected; "
        f"{len(misplaced)} row(s) break the original/positive/negative order."
    )

# Pad a trailing incomplete triplet with None so every output row has all
# three cells (no padding is added when the row count is a multiple of three).
sentences = data["sentence"].tolist()
sentences.extend([None] * (-len(sentences) % group_size))

# One dict per triplet, keyed by sentiment
rows = [
    dict(zip(sentiment_columns, sentences[start:start + group_size]))
    for start in range(0, len(sentences), group_size)
]

# Build the frame in a single step. Passing `columns` keeps the three columns
# (in this order) even when there are no rows, and avoids concatenating with
# an empty placeholder frame, which newer pandas versions warn about.
transposed_data = pd.DataFrame(rows, columns=sentiment_columns)

# Save the transposed DataFrame to a CSV file next to the source corpus
transposed_file_path = 'sentimental_paraphrase_1-200_transposed.csv'
transposed_data.to_csv(folder_input_path + transposed_file_path, index=False)

# Display the transposed data
transposed_data.head()


Unnamed: 0,original,positive,negative
0,The weather is unpredictable today.,The weather today holds exciting surprises.,Today's weather is frustratingly inconsistent.
1,The food at this restaurant is decent.,The cuisine at this restaurant is delightfully...,The food at this restaurant is barely passable.
2,This movie has an interesting plot.,This movie boasts a captivating and thought-pr...,The plot of this movie is confusing and poorly...
3,Our team has a fair chance of winning.,Our team is well-positioned for a triumphant v...,Our team's chances of winning are quite doubtful.
4,The new software update includes several featu...,The latest software update is packed with inno...,The new software update is cluttered with unne...
