In [12]:
import pandas as pd

### Based on the exploratory data analysis (EDA) performed on the curated dataset, we proceed with preprocessing the data.

In [25]:
# Read the curated QA dataset and rewrite backslash path separators as
# forward slashes in the image-path column.
df = pd.read_csv("finalDataset.csv").assign(
    full_image_path=lambda frame: frame['full_image_path'].str.replace("\\", "/", regex=False)
)

In [14]:
# Preview the loaded frame (the notebook display is truncated to the first and last rows).
df

Unnamed: 0,image_id,full_image_path,question,answer
0,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,What shape is this?,Rectangular
1,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,Is it a cover?,Yes
2,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,What color is bottom?,Red
3,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,Are there multiple colors?,Yes
4,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,Is there a camera hole?,Yes
...,...,...,...,...
159145,81yoY30yFiL,abo-images-small/images/small/7a/7ab6a11b.jpg,What word is centered on the cover?,Super
159146,81yoY30yFiL,abo-images-small/images/small/7a/7ab6a11b.jpg,What color is the written text?,Yellow
159147,81yoY30yFiL,abo-images-small/images/small/7a/7ab6a11b.jpg,What is the shape of the cover?,Rectangle
159148,81yoY30yFiL,abo-images-small/images/small/7a/7ab6a11b.jpg,What is the background comprised of?,Fragments


In [26]:
# Record the (rows, columns) shape before any filtering, as a baseline for the later steps.
print("Initial shape:", df.shape)


Initial shape: (159150, 4)


In [27]:
# Lowercase both text columns (e.g. "Yes" and "yes" become the same answer).
for text_col in ('question', 'answer'):
    df[text_col] = df[text_col].str.lower()

### Dropping null values

In [28]:
# Count missing values per column, then the number of rows containing at least one.
null_flags = df.isnull()
print("Null value counts:\n", null_flags.sum())
print("Rows with nulls:", null_flags.any(axis=1).sum())

Null value counts:
 image_id            0
full_image_path     0
question            0
answer             54
dtype: int64
Rows with nulls: 54


In [29]:
# Drop nulls
# Removes every row that has a missing value in any column and rebinds `df` to the result.
# NOTE(review): pd.read_csv's default NA parsing turns literal strings such as "NA",
# "N/A" or "null" into NaN. If the missing answers were real answers of that form,
# consider loading with keep_default_na=False instead of dropping them.
# TODO confirm against the raw CSV.
df = df.dropna()
print("After dropping nulls:", df.shape)

After dropping nulls: (159096, 4)


### Handling yes/no answers

- As we saw in EDA, roughly 45% of answers are yes/no, so we down-sample those answers to keep them from dominating the dataset.

In [30]:
# Count how many answers are exactly "yes" or "no" (case-insensitive).
answers_lowered = df['answer'].str.lower()
yes_no_count = answers_lowered.isin(['yes', 'no']).sum()
print("Rows with yes/no answers:", yes_no_count)

Rows with yes/no answers: 72491


In [31]:
# Boolean masks (aligned to df's index) for the two binary answers, plus their totals.
answer_lowered = df['answer'].str.lower()
yes_mask = answer_lowered.eq('yes')
no_mask = answer_lowered.eq('no')
yes_count, no_count = yes_mask.sum(), no_mask.sum()
print(f"Original 'yes' count: {yes_count}")
print(f"Original 'no' count: {no_count}")

Original 'yes' count: 64883
Original 'no' count: 7608


In [32]:
# Down-sample the binary answers: "yes" and "no" are each capped at 1% of the
# current row count, so together they contribute at most about 2% of that count.
total_rows = len(df)
target_yes_no = int(0.01 * total_rows)

# Never request more rows than a class actually has; the fixed seed keeps the draw reproducible.
yes_sampled = df.loc[yes_mask].sample(n=min(target_yes_no, yes_count), random_state=42)
no_sampled = df.loc[no_mask].sample(n=min(target_yes_no, no_count), random_state=42)


In [33]:
# Rows whose answer is neither "yes" nor "no" are kept as they are.
is_binary_answer = yes_mask | no_mask
non_yes_no_df = df.loc[~is_binary_answer]
print("Non-yes/no rows:", non_yes_no_df.shape)

Non-yes/no rows: (86605, 4)


In [34]:
# Rebuild df from the untouched rows plus the down-sampled yes/no rows, with a fresh 0..n-1 index.
df = pd.concat([non_yes_no_df, yes_sampled, no_sampled], ignore_index=True)
print("After limiting yes/no :", df.shape)

# Recompute the masks and counts against the rebuilt frame, whose index has been renumbered.
lowered_answers = df['answer'].str.lower()
yes_mask = lowered_answers.eq('yes')
no_mask = lowered_answers.eq('no')
yes_count, no_count = yes_mask.sum(), no_mask.sum()
print(f" 'yes' count: {yes_count}")
print(f" 'no' count: {no_count}")



After limiting yes/no : (89785, 4)
 'yes' count: 1590
 'no' count: 1590


In [35]:
# Preview the frame after limiting yes/no answers (display truncated to the first and last rows).
df

Unnamed: 0,image_id,full_image_path,question,answer
0,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,what shape is this?,rectangular
1,81+4dBN1jsL,abo-images-small/images/small/9d/9dfccb37.jpg,what color is bottom?,red
2,71tgJqobw6L,abo-images-small/images/small/77/77412532.jpg,what shape are the designs?,butterflies
3,71tgJqobw6L,abo-images-small/images/small/77/77412532.jpg,what color is the case?,black
4,71tgJqobw6L,abo-images-small/images/small/77/77412532.jpg,what is the light effect?,neon
...,...,...,...,...
89780,61VbmqSEozL,abo-images-small/images/small/a5/a5c5da7a.jpg,is the background white?,no
89781,81EdKcYz63L,abo-images-small/images/small/a4/a4e56c51.jpg,is it rectangular?,no
89782,81neFBcDQPL,abo-images-small/images/small/bd/bd539cac.jpg,is it a foldable phone?,no
89783,81o+rtsQmIL,abo-images-small/images/small/95/95fdb0fb.jpg,are the tiles uniform?,no


### Answer distribution

In [510]:
# We have seen the answer distribution in EDA, so we cap every answer at 3% of the
# current row count to flatten the distribution.
# Note: the cap is computed from the size *before* trimming, so in the reduced frame
# the most frequent answers can still hold a share slightly above 3%.

total_rows = len(df)
max_allowed_count = int(0.03 * total_rows)

# Group once instead of re-scanning the whole frame for every distinct answer.
rows_by_answer = df.groupby('answer', sort=False)
new_dfs = []

# Iterate in value_counts order so the concatenation order, and therefore the
# seeded shuffle below, is the same as with a per-answer boolean filter.
for ans, count in df['answer'].value_counts().items():
    ans_df = rows_by_answer.get_group(ans)
    if count > max_allowed_count:
        # randomly sample only max_allowed_count rows
        ans_df = ans_df.sample(n=max_allowed_count, random_state=42)
    new_dfs.append(ans_df)

df_reduced = pd.concat(new_dfs, ignore_index=True)

# Shuffle so rows are no longer grouped by answer, then renumber the index.
df_reduced = df_reduced.sample(frac=1, random_state=42).reset_index(drop=True)
df = df_reduced

print(f"Original size: {total_rows}, Reduced size: {len(df_reduced)}")


Original size: 89785, Reduced size: 78883


In [511]:
# Row count after capping the per-answer frequency.
print(f"no of datapoints {len(df)}")


no of datapoints 78883


### In EDA we looked at the word count of questions and noticed that some questions consist of just one word, so we remove them here

In [512]:
# one-word questions
word_counts = df['question'].str.split().str.len()
one_word_count = (word_counts == 1).sum()
print("One-word questions:", one_word_count)

One-word questions: 3399


In [513]:
# Keep only questions with at least two words. `word_counts` was computed from df,
# so the boolean mask lines up with df row by row.
df = df.loc[word_counts.gt(1)]

In [514]:
# Row count after removing one-word questions.
print(f"no of datapoints {len(df)}")


no of datapoints 75484


In [515]:
# Preview the filtered frame; the index is not renumbered here, so labels can exceed the row count.
df

Unnamed: 0,image_id,full_image_path,question,answer
0,71VyNJh1ApL,abo-images-small/images/small/64/649b029e.jpg,what letter is visible?,d
1,71NlUsMrL+L,abo-images-small/images/small/6d/6d15f0f9.jpg,what color is the background of the right-side...,yellow
2,71S1WDtkFpL,abo-images-small/images/small/d4/d403a369.jpg,what color is the jacket?,yellow
3,71RFBqFrhzL,abo-images-small/images/small/c6/c6c40357.jpg,what color is predominant?,pink
4,81BCgIYDMYL,abo-images-small/images/small/31/31a770ea.jpg,are there more dark or light parts?,light
...,...,...,...,...
78878,81fgb2JAKzL,abo-images-small/images/small/8b/8bcd2fbd.jpg,what is the boot's color?,blue
78879,71FjSurSI5L,abo-images-small/images/small/0b/0b9a2467.jpg,what is the background texture?,wood
78880,71Ag3cMsZSL,abo-images-small/images/small/5f/5f854cc6.jpg,what type of shoe is this?,boot
78881,71RNMsV8HEL,abo-images-small/images/small/4b/4bb53f13.jpg,how many visible ports are there?,two


### We randomly sample 70k datapoints out of 75484 datapoints

In [516]:
# Randomly draw a fixed-size subset (seeded for reproducibility) and renumber its index.
SAMPLE_SIZE = 70_000
df_sampled = df.sample(n=SAMPLE_SIZE, random_state=42).reset_index(drop=True)
# Build the label from SAMPLE_SIZE so the message cannot drift from the number of rows drawn.
print(f"After sampling {SAMPLE_SIZE // 1000}k:", df_sampled.shape)

After sampling 60k: (70000, 4)


In [517]:
# Answer distribution: one row per distinct answer with its count and percentage share.
sampled_total = len(df_sampled)
answer_counts = df_sampled['answer'].value_counts().reset_index()
answer_counts.columns = ['answer', 'count']
answer_counts['%'] = (answer_counts['count'] / sampled_total) * 100

# Share of the binary answers. The denominator is the actual sample size rather than
# a hard-coded literal, so the percentages stay correct if the sample size changes.
sampled_answers = df_sampled['answer'].str.lower()
yes_count = int((sampled_answers == 'yes').sum())
no_count = int((sampled_answers == 'no').sum())

print(f"'Yes' answers: {yes_count} ({yes_count / sampled_total * 100:.2f}%)")
print(f"'No' answers: {no_count} ({no_count / sampled_total * 100:.2f}%)")
print(answer_counts.head(10))

'Yes' answers: 1351 (1.93%)
'No' answers: 1451 (2.07%)
        answer  count         %
0        three   2504  3.577143
1          one   2481  3.544286
2         pink   2456  3.508571
3         blue   2455  3.507143
4          two   2450  3.500000
5          red   2440  3.485714
6        black   2386  3.408571
7  rectangular   2068  2.954286
8       yellow   1913  2.732857
9       circle   1787  2.552857


### Train-val split (80%-20%)

In [518]:
# Row-wise 80/20 train/validation split with a fixed seed.
# NOTE(review): rows are split independently, and the dataset holds several questions
# per image_id, so the same image can land in both train and val. If image-level
# separation is required, split on image_id groups instead. TODO confirm intent.
from sklearn.model_selection import train_test_split

split_options = dict(test_size=0.2, random_state=42)
train_df, val_df = train_test_split(df_sampled, **split_options)
print("Train shape:", train_df.shape)
print("Val shape:", val_df.shape)

Train shape: (56000, 4)
Val shape: (14000, 4)


In [522]:
# Persist both splits as CSV files, leaving the index out of the output.
for split_frame, out_path in ((train_df, "train_datapoints.csv"), (val_df, "val_datapoints.csv")):
    split_frame.to_csv(out_path, index=False)


In [520]:
# Peek at the first five training rows; the index labels are carried over from df_sampled.
train_df.head()

Unnamed: 0,image_id,full_image_path,question,answer
47339,619aXYWDlfL,abo-images-small/images/small/82/8281b602.jpg,what color is the ring?,white
67456,71oqXnBtfUL,abo-images-small/images/small/eb/ebb8f08b.jpg,what color is background?,pink
12308,618rflVTnTL,abo-images-small/images/small/b9/b9c8a85a.jpg,how many stripes are visible?,three
32557,61oFSxHBFLL,abo-images-small/images/small/68/6876f38e.jpg,what is the product?,case
664,717qOUqAkFL,abo-images-small/images/small/a9/a920ed25.jpg,what is the shape surrounding the image?,oval


In [521]:
# Peek at the first five validation rows; the index labels are carried over from df_sampled.
val_df.head()

Unnamed: 0,image_id,full_image_path,question,answer
46730,71uK-r+TRtL,abo-images-small/images/small/bc/bc859253.jpg,what is the shape?,rectangular
48393,81qDgeDk8UL,abo-images-small/images/small/dc/dc90df81.jpg,"what is more prominent, the dark or light part?",dark
41416,61JHYkmJsKL,abo-images-small/images/small/88/88d73c91.jpg,"which word is smaller, ""time"" or ""aayega""?",time
34506,51Ftiw-ZG8L,abo-images-small/images/small/a9/a9ebb518.jpg,what time is shown?,8:08
43725,61S5Jup3CJL,abo-images-small/images/small/d7/d7e9fd72.jpg,what color is the background?,pink
