In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Load vocabulary list
words = pd.read_csv("flashcard-vocabulary-list.csv", encoding='latin-1')
words.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; To lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [3]:
# Need to remove leading spaces in definition column to sort values alphabetically
words["Definition"] = words["Definition"].apply(lambda x: x.lstrip())
words.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; To lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [4]:
# Need to capitalize first word in definitons for consistency
# NOTE: words are intentionally left lowercase
words["Definition"] = words["Definition"].apply(lambda x: x.capitalize())
words.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [5]:
# Need to remove leading spaces in Word column to sort values alphabetically
words["Word"] = words["Word"].apply(lambda x: x.lstrip())
words.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [6]:
sorted_words = words.sort_values("Word")
sorted_words.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [7]:
# Need to reset index of words and definitions once in alphabetical order
sorted_words = sorted_words.reset_index()
sorted_words.head(6)

Unnamed: 0,index,Word,Definition
0,0,abacus,Frame with balls for calculating
1,1,abase,"Degrade or humble; to lower in rank, status, o..."
2,2,abate,"Reduce, diminish; to lessen to subside"
3,3,abdicate,Formally give up the throne (or some other pow...
4,4,abdication,Giving up control authority
5,5,aberrant,"Abnormal, deviant"


In [8]:
# Need to drop previous index of words and definitions after resetting index and sorting words in alphabetical order
sorted_words.drop("index", axis = 1, inplace = True)
sorted_words.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [9]:
def check_duplicates(list_of_words):
    result = False
    setofwords = set()
    for elem in list_of_words:
        if elem in setofwords:
            result = True
        else: 
            setofwords.add(elem)
    if result == True:
        print("This list contains duplicated words.")
    elif result == False:
        print("There are no duplicate words.")

check_duplicates(sorted_words["Word"])

There are no duplicate words.


In [10]:
# Need to show which words are duplicated
# Select duplicate rows except first occurrence based on all columns
duplicate_df = sorted_words[sorted_words.duplicated(['Word'], keep = False)]
duplicate_df.head(6)

Unnamed: 0,Word,Definition


In [11]:
# Need to create two different dataframes to merge duplicated values
# Select duplicate rows except first occurrence based on all columns
duplicate_df_first = sorted_words[sorted_words.duplicated(['Word'], keep = 'first')]
duplicate_df_first.head(6)

Unnamed: 0,Word,Definition


In [12]:
# Need to create two different dataframes to merge duplicated values
# Select duplicate rows except first occurrence based on all columns
duplicate_df_last = sorted_words[sorted_words.duplicated(['Word'], keep = 'last')]
duplicate_df_last.head(6)

Unnamed: 0,Word,Definition


In [13]:
duplicate_rows = duplicate_df.reset_index()
duplicate_rows.drop("index", axis = 1, inplace = True)
duplicate_rows.head(6)

Unnamed: 0,Word,Definition


In [14]:
merge_words = pd.merge(duplicate_df_first, duplicate_df_last, on = ['Word'])
merge_words.head(6)

Unnamed: 0,Definition_x,Word,Definition_y


In [15]:
merge_words['Definition'] = merge_words[["Definition_x", "Definition_y"]].apply(lambda x: "; ".join(x), axis=1)
merge_words.head(6)

Unnamed: 0,Definition_x,Word,Definition_y,Definition


In [16]:
merge_words.drop(["Definition_x", "Definition_y"], axis = 1, inplace = True)
merge_words.head(6)

Unnamed: 0,Word,Definition


In [17]:
duplicates_dropped = sorted_words.drop_duplicates(["Word"], keep = False)
duplicates_dropped.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [18]:
vocabulary_list = pd.concat([duplicates_dropped, merge_words])
vocabulary_list.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [19]:
# Sorting word values alphabetically again
vocabulary_list = vocabulary_list.sort_values("Word")
vocabulary_list.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [20]:
# Need to reset index of words and definitions once in alphabetical order again
# Need to drop previous index of words and definitions after resetting index and sorting words in alphabetical order
vocabulary_list = vocabulary_list.reset_index()
vocabulary_list.drop("index", axis = 1, inplace = True)
vocabulary_list.head(6)

Unnamed: 0,Word,Definition
0,abacus,Frame with balls for calculating
1,abase,"Degrade or humble; to lower in rank, status, o..."
2,abate,"Reduce, diminish; to lessen to subside"
3,abdicate,Formally give up the throne (or some other pow...
4,abdication,Giving up control authority
5,aberrant,"Abnormal, deviant"


In [21]:
# Need to double check for duplicates again
def check_duplicates(list_of_words):
    result = False
    setofwords = set()
    for elem in list_of_words:
        if elem in setofwords:
            result = True
        else: 
            setofwords.add(elem)
    if result == True:
        print("This list contains duplicated words.")
    elif result == False:
        print("There are no duplicate words.")

check_duplicates(vocabulary_list["Word"])

There are no duplicate words.


In [22]:
vocabulary_list = vocabulary_list.set_index('Word')

In [23]:
vocabulary_list.head(6)

Unnamed: 0_level_0,Definition
Word,Unnamed: 1_level_1
abacus,Frame with balls for calculating
abase,"Degrade or humble; to lower in rank, status, o..."
abate,"Reduce, diminish; to lessen to subside"
abdicate,Formally give up the throne (or some other pow...
abdication,Giving up control authority
aberrant,"Abnormal, deviant"


In [24]:
# Need to create a copy of sorted and cleaned vocabulary that is different from raw data .csv file 
vocabulary_list.to_csv(r'C:\Users\jacqu\Desktop\Python Portfolio\Vocabulary Flashcard Quiz\vfq vocabulary list.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\jacqu\\Desktop\\Python Portfolio\\Vocabulary Flashcard Quiz\\vfq vocabulary list.csv'