In [1]:
import pandas as pd
import numpy as np
import json


In [2]:
# Load Dataset 1.json
#
# The file is parsed as a list of entries, each carrying an 'answers' list.
# Every answer contributes one row to df1: its 'group' name plus its 'members'
# joined into a single ', '-separated string.

# Path to JSON file
file_path = 'datasets/1.json'

# Read and parse the JSON file
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        json_data = json.load(file)

    # Only 'group' and 'members' end up in the DataFrame, so only those keys are
    # read. Looking up fields that are discarded anyway (id, date, level) would
    # make the whole load fail on any entry that happens to lack one of them.
    rows = [
        {
            'group': answer['group'],
            'members': ', '.join(answer['members']),  # comma-separated string
        }
        for entry in json_data
        for answer in entry['answers']
    ]

    # Create the DataFrame (explicit columns keep the schema even when rows is empty)
    df1 = pd.DataFrame(rows, columns=['group', 'members'])

    # Display the DataFrame
    display(df1)
except FileNotFoundError:
    print('Error: The file was not found.')
except json.JSONDecodeError as parse_error:
    print('Error parsing JSON:', parse_error)
except Exception as e:
    # Any other failure (e.g. an answer without 'group'/'members') is reported
    # rather than raised; df1 is then left undefined for the following cells.
    print('An error occurred:', e)



Unnamed: 0,group,members
0,WET WEATHER,"HAIL, RAIN, SLEET, SNOW"
1,NBA TEAMS,"BUCKS, HEAT, JAZZ, NETS"
2,KEYBOARD KEYS,"OPTION, RETURN, SHIFT, TAB"
3,PALINDROMES,"KAYAK, LEVEL, MOM, RACECAR"
4,FOOTWEAR,"BOOT, LOAFER, PUMP, SNEAKER"
...,...,...
2063,CUT THE ___,"CHEESE, CORD, DECK, MUSTARD"
2064,CONTORTED,"BENT, GNARLY, TWISTED, WARPED"
2065,SMALLEST AMOUNT,"LICK, OUNCE, SHRED, TRACE"
2066,ALGEBRA TERMS,"EXPONENT, POWER, RADICAL, ROOT"


In [3]:
# Turn each ', '-separated 'members' string into a list, explode the lists so
# every word gets its own row, and expose the result under the 'words' column.
#
# The split is done on a derived frame (assign) instead of being written back
# into df1: overwriting df1['members'] with lists makes the cell fail on a
# second run, because `.str.split` over list values produces NaN, not words.
json_df = (
    df1
    .assign(members=df1['members'].str.split(', '))
    .explode('members')
    .rename(columns={'members': 'words'})
    .reset_index(drop=True)
)

display(json_df)

Unnamed: 0,group,words
0,WET WEATHER,HAIL
1,WET WEATHER,RAIN
2,WET WEATHER,SLEET
3,WET WEATHER,SNOW
4,NBA TEAMS,BUCKS
...,...,...
8267,ALGEBRA TERMS,ROOT
8268,WORDS BEFORE “ROOM” TO MEAN LAVATORY,BATH
8269,WORDS BEFORE “ROOM” TO MEAN LAVATORY,POWDER
8270,WORDS BEFORE “ROOM” TO MEAN LAVATORY,REST


In [4]:
# Dataset 2 (Parquet): read the file with the pyarrow engine, then preview it.
df2 = pd.read_parquet(
    'datasets/2.parquet',
    engine='pyarrow',
)
display(df2)

Unnamed: 0,label,text
0,HATS,BERET BOWLER FEDORA FEZ
1,ORGANS,HEART KIDNEY LIVER LUNG
2,PARTS OF A BOOK,COVER JACKET PAGE SPINE
3,EUROPEAN COUNTRIES,DENMARK GREECE POLAND PORTUGAL
4,SYNONYMS FOR IMITATE,COPY ECHO MIMIC PARROT
...,...,...
87,LETTER HOMOPHONES,ARE QUEUE SEA WHY
88,WET WEATHER,HAIL RAIN SLEET SNOW
89,NBA TEAMS,BUCKS HEAT JAZZ NETS
90,KEYBOARD KEYS,OPTION RETURN SHIFT TAB


In [5]:
# Split each space-separated 'text' string into words and give every word its
# own row. The split is applied to a derived frame rather than assigned back to
# df2['text']: once that column holds lists, running `.str.split` on it again
# returns NaN, so the in-place version could not be safely re-executed.
parquet_df2 = (
    df2
    .assign(text=df2['text'].str.split(' '))
    .explode('text')
    .reset_index(drop=True)
)
display(parquet_df2)

Unnamed: 0,label,text
0,HATS,BERET
1,HATS,BOWLER
2,HATS,FEDORA
3,HATS,FEZ
4,ORGANS,HEART
...,...,...
371,PALINDROMES,KAYAK
372,PALINDROMES,LEVEL
373,PALINDROMES,MOM
374,PALINDROMES,RACE


In [6]:
# Dataset 3 (Parquet): read the file with the pyarrow engine, then preview it.
df3 = pd.read_parquet(
    'datasets/3.parquet',
    engine='pyarrow',
)
display(df3)

Unnamed: 0,label,text
0,MUSICAL INSTRUMENTS,BASS BASSOON HARP RECORDER
1,PLANT GROWTHS,BLOOM BUD SHOOT SPROUT
2,BRING UP,FOSTER NURSE RAISE REAR
3,SOLAR EMANATIONS,CORONA FLARE LIGHT RADIATION
4,HOW FAST SOMETHING IS GOING,CLIP PACE RATE SPEED
...,...,...
727,CLASSIC TOYS,BLOCKS DOLL TOP YO-YO
728,SHAKESPEARE CHARACTERS,DUNCAN JULIET PUCK VIOLA
729,PLACES FOR WORSHIP,ALTAR RELIQUARY SHRINE TEMPLE
730,CARTOON CATS,FELIX GARFIELD SYLVESTER TOM


In [7]:
# Split each space-separated 'text' string into words and give every word its
# own row. The split is applied to a derived frame rather than assigned back to
# df3['text']: once that column holds lists, running `.str.split` on it again
# returns NaN, so the in-place version could not be safely re-executed.
parquet_df3 = (
    df3
    .assign(text=df3['text'].str.split(' '))
    .explode('text')
    .reset_index(drop=True)
)
display(parquet_df3)

Unnamed: 0,label,text
0,MUSICAL INSTRUMENTS,BASS
1,MUSICAL INSTRUMENTS,BASSOON
2,MUSICAL INSTRUMENTS,HARP
3,MUSICAL INSTRUMENTS,RECORDER
4,PLANT GROWTHS,BLOOM
...,...,...
2938,CARTOON CATS,TOM
2939,PRESIDENTIAL FIRST NAMES,CALVIN
2940,PRESIDENTIAL FIRST NAMES,CHESTER
2941,PRESIDENTIAL FIRST NAMES,GROVER


In [8]:
# Dataset 4 (Parquet): read the file with the pyarrow engine, then preview it.
df4 = pd.read_parquet(
    'datasets/4.parquet',
    engine='pyarrow',
)
display(df4)

Unnamed: 0,label,text
0,ANIMAL GROUP NAMES,COLONY HERD PRIDE SWARM
1,STONED,BAKED BLAZED HIGH LIT
2,AP CLASSES,BIO CHEM GOV STATS
3,TAXONOMY RANKS,CLASS DOMAIN FAMILY ORDER
4,COMPUTER EQUIPMENT,KEYBOARD MONITOR MOUSE SPEAKER
...,...,...
86,FISH,TANG TETRA SKATE SOLE
87,SEVEN DWARFS,BASHFUL DOC GRUMPY HAPPY
88,FILE EXTENSIONS,GIF PDF TIFF ZIP
89,FLIGHTLESS BIRDS,EMU KIWI OSTRICH PENGUIN


In [9]:
# Split each space-separated 'text' string into words and give every word its
# own row. The split is applied to a derived frame rather than assigned back to
# df4['text']: once that column holds lists, running `.str.split` on it again
# returns NaN, so the in-place version could not be safely re-executed.
parquet_df4 = (
    df4
    .assign(text=df4['text'].str.split(' '))
    .explode('text')
    .reset_index(drop=True)
)
display(parquet_df4)

Unnamed: 0,label,text
0,ANIMAL GROUP NAMES,COLONY
1,ANIMAL GROUP NAMES,HERD
2,ANIMAL GROUP NAMES,PRIDE
3,ANIMAL GROUP NAMES,SWARM
4,STONED,BAKED
...,...,...
363,FLIGHTLESS BIRDS,PENGUIN
364,TROPICAL FRUITS,BANANA
365,TROPICAL FRUITS,COCONUT
366,TROPICAL FRUITS,MANGO


In [10]:
# Combine Parquet datasets
#
# The goal is a row-wise union of the three exploded frames, which all carry
# the 'label' and 'text' columns. Stack them with pd.concat and drop repeated
# rows, instead of chaining outer merges on every shared column:
#   * concat keeps rows in input order, whereas an outer merge is documented to
#     sort the join keys, so its row order can vary between pandas versions;
#   * it is the same union idiom the final "combine all dataframes" cell uses.
# The resulting set of distinct (label, text) rows is the same as before.
parquet_df = (
    pd.concat([parquet_df2, parquet_df3, parquet_df4], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)  # contiguous 0..n-1 index after duplicates are removed
    .rename(columns={'label': 'group', 'text': 'words'})
)

display(parquet_df)

Unnamed: 0,group,words
0,HATS,BERET
1,HATS,BOWLER
2,HATS,FEDORA
3,HATS,FEZ
4,ORGANS,HEART
...,...,...
3612,FLIGHTLESS BIRDS,PENGUIN
3613,TROPICAL FRUITS,BANANA
3614,TROPICAL FRUITS,COCONUT
3615,TROPICAL FRUITS,MANGO


In [11]:
# Dataset 5 (CSV): read only the 'Word' and 'Group Name' columns, then preview.
df5 = pd.read_csv(
    'datasets/5.csv',
    usecols=['Word', 'Group Name'],
)
display(df5)

Unnamed: 0,Word,Group Name
0,SNOW,WET WEATHER
1,LEVEL,PALINDROMES
2,SHIFT,KEYBOARD KEYS
3,KAYAK,PALINDROMES
4,HEAT,NBA TEAMS
...,...,...
8123,VIPER,SPORTS CARS
8124,SCAVENGER,___ HUNT
8125,MUSTANG,SPORTS CARS
8126,WINK,THINGS YOU CAN DO WITH YOUR EYELIDS


In [12]:
# Dataset 6 (CSV): read only the 'Word' and 'Group Name' columns, then preview.
df6 = pd.read_csv(
    'datasets/6.csv',
    usecols=['Word', 'Group Name'],
)
display(df6)

Unnamed: 0,Word,Group Name
0,SNOW,WET WEATHER
1,LEVEL,PALINDROMES
2,SHIFT,KEYBOARD KEYS
3,KAYAK,PALINDROMES
4,HEAT,NBA TEAMS
...,...,...
8267,BENT,CONTORTED
8268,TRACE,SMALLEST AMOUNT
8269,WARPED,CONTORTED
8270,EXPONENT,ALGEBRA TERMS


In [13]:
# Combine Dataframes
#
# Row-wise union of the two CSV frames (both limited to 'Word' / 'Group Name'
# by their usecols): stack them and drop repeated rows. pd.concat replaces the
# outer merge on all shared columns because it keeps rows in input order (an
# outer merge is documented to sort the join keys) and matches the union idiom
# used by the final "combine all dataframes" cell. The set of distinct rows is
# unchanged; the columns are then renamed to the shared 'words' / 'group' names.
csv_df = (
    pd.concat([df5, df6], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)  # contiguous 0..n-1 index after duplicates are removed
    .rename(columns={'Word': 'words', 'Group Name': 'group'})
)

display(csv_df)

Unnamed: 0,words,group
0,SNOW,WET WEATHER
1,LEVEL,PALINDROMES
2,SHIFT,KEYBOARD KEYS
3,KAYAK,PALINDROMES
4,HEAT,NBA TEAMS
...,...,...
8729,BENT,CONTORTED
8730,TRACE,SMALLEST AMOUNT
8731,WARPED,CONTORTED
8732,EXPONENT,ALGEBRA TERMS


In [22]:
# Final union: stack the JSON-, Parquet- and CSV-derived frames (aligned on their
# 'group' / 'words' column names) and discard rows that repeat an earlier one.
# The index is not reset after de-duplication, so it may contain gaps.
df = pd.concat(
    [json_df, parquet_df, csv_df],
    ignore_index=True,
).drop_duplicates()
display(df)

Unnamed: 0,group,words
0,WET WEATHER,HAIL
1,WET WEATHER,RAIN
2,WET WEATHER,SLEET
3,WET WEATHER,SNOW
4,NBA TEAMS,BUCKS
...,...,...
17380,WORDS THAT SOUND LIKE PLURAL LETTERS,TEASE
17381,WORDS THAT SOUND LIKE PLURAL LETTERS,WISE
17388,WORDS THAT SOUND LIKE PLURAL LETTERS,GEEZ
17391,WORDS THAT SOUND LIKE PLURAL LETTERS,SEIZE


In [1]:
# Export the combined table to CSV, leaving the DataFrame index out of the file.
# `df` is created by the "Combine all dataframes" cell, so that cell must have
# run in the current kernel first; otherwise this line raises NameError.
df.to_csv(
    'datasets/dataset.csv',
    index=False,
)

NameError: name 'df' is not defined