In [375]:
# Make Jupyter reload library before every execution

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Translation

Each data file is in a different language. We need to translate them into English before concatenation.
First, we'll examine the headers

In [376]:
# We code some function for preprocessing here
import utils

In [377]:
import pandas as pd

# Load the survey data files
file_paths = {
    'English': 'data/en.csv',
    'Bengali': 'data/bengali.csv',
    'Korean': 'data/kr.csv',
    'Vietnamese': 'data/vi.csv'
}

# Read the headers of each file
dfs = {}
headers = {}
for language, file_path in file_paths.items():
    df = pd.read_csv(file_path)  # Read only headers
    dfs[language] = df
    headers[language] = df.columns.tolist()

headers


{'English': ['Timestamp',
  'What is your age group?',
  'What is your gender?',
  'What is the highest level of education you have completed?',
  'Which category best describes your occupation?',
  'On average, how many days per week do you exercise for at least 30 (or under 30 mins but high intensity) minutes? ',
  'On average, how many hours per day do you use electronic devices such as smartphones, laptops, televisions, etc.? ',
  'On average, how much time do you usually spend on phone / computers before sleep?',
  '(Optional) What is your height in centimeters?',
  '(Optional) What is your weight in kilograms?',
  'On average, what time do you typically go to bed at night?',
  'On average, what time do you typically wake up in the morning?',
  'On average, how long does it take you to fall asleep at night?',
  'On average, how long is your typical daytime nap?',
  'On average, how many hours do you sleep per 24-hour period?',
  'How would you rate your overall sleep quality?',
  

Let's drop Username and Email columns for privacy

In [378]:
for lang, df in dfs.items():
    try:
        # Email column is the last column
        df.drop(labels=df.columns[-1], axis=1, inplace=True)

        # Drop username column if present
        df.drop(columns='Username', inplace=True)
    except Exception as err:
        pass
        # print(err)

As you can see, column headers are lengthy, which could make it harder for us to analyze. In the next section, we will translate and shorten them while trying to maintaining the original meaning as close as possible.

## Bengali

### Translate Headers to English

In [379]:

dfs['Bengali'].head(3)

Unnamed: 0,Timestamp,আপনার বয়স কত?*,আপনার লিঙ্গ কি? *,আপনার শিক্ষাগত যোগ্যতা কি? *\n,আপনার পেশা কি? *,গড়ে সপ্তাহে কয়দিন আপনি কমপক্ষে ৩০ মিনিট বা তার চেয়ে বেশি সময়ের জন্য ব্যায়াম করেন? *,"গড়ে আপনি কত ঘণ্টা ইলেকট্রনিক ডিভাইস ব্যবহার করেন, যেমন স্মার্টফোন, ল্যাপটপ, টেলিভিশন, ইত্যাদি?",গড়ে আপনি ঘুমানোর আগে ফোনে / ল্যপটপে কতটুকু সময় অতিবাহিত করেন?,(ঐচ্ছিক) আপনার উচ্চতা কত? (সে.মি. এককে),(ঐচ্ছিক) আপনার ওজন কত? (কেজি তে),রাতে সাধারনত কখন ঘুমাতে যান?,সাধারনত আপনি সকালে কখন ঘুম থেকে উঠেন?,সাধারনত শোয়ার কতক্ষনের মধ্যে আপনার ঘুম চলে আসে?,"আপনি কি দিনের বেলা সামান্য ঘুমিয়ে নেন? যদি উত্তর হ্যা হয়, তাহলে তা কতক্ষন?",সব মিলিয়ে গড়ে আপনি ২৪ ঘণ্টার মধ্যে কত ঘণ্টা ঘুমান?,আপনার সামগ্রীক ঘুমের মান কেমন থাকে?,"আপনি কি পরিমান ঘুমের মধ্য বিঘ্নতা পরিলক্ষন করেন, যেমন রাতে ঘুম ভেঙ্গে যাওয়া বা খুব হালকা ঘুম?",আপনি কি ঘুমের জন্য কোন ঔষধ সেবন করেন?
0,2023/11/07 3:26:01 PM GMT+9,২৫-৩৪,মহিলা,মাস্টার্স ডিগ্রি,অন্যান্য:,১-২ দিন,১-৩ ঘণ্টা,৩০ মিনিট - ১ ঘণ্টা,152.4,47.0,23:00,06:30,৬০ মিনিটের বেশি,"হ্যাঁ, ৩০ মিনিটের কম",৪-৬ ঘণ্টা,2,প্রায় সময়ই,না
1,2023/11/07 3:45:02 PM GMT+9,২৫-৩৪,মহিলা,মাস্টার্স ডিগ্রি,অন্যান্য:,৩-৪ দিন,৪-৬ ঘণ্টা,৩০ মিনিটের কম,,,00:00,08:30,১৫-৩০ মিনিট,"হ্যাঁ, ৩০ মিনিটের কম",৬ ঘণ্টা এবং তার বেশি,4,মাঝে মধ্যেই,না
2,2023/11/07 3:51:38 PM GMT+9,২৫-৩৪,মহিলা,ব্যাচলার ডিগ্রি,অন্যান্য:,০ দিন,৭ অথবা আরও ঘণ্টা,২ ঘণ্টা অথবা তার বেশি,153.4,68.0,00:30,09:30,১৫ মিনিটের কম,"না, আমি দিনে ঘুমায় না",৪-৬ ঘণ্টা,3,প্রায় সময়ই,না


Column Headers are too lengthy and hard to read. Let's rename them to English

In [380]:
# Define the new column names
new_column_names = {
    'Timestamp': 'Timestamp',
    'আপনার বয়স কত?*': 'Age Group',
    'আপনার লিঙ্গ কি? *': 'Gender',
    'আপনার শিক্ষাগত যোগ্যতা কি? *\n': 'Education Level',
    'আপনার পেশা কি? *': 'Occupation',
    'গড়ে সপ্তাহে কয়দিন আপনি কমপক্ষে ৩০ মিনিট বা তার চেয়ে বেশি সময়ের জন্য ব্যায়াম করেন? *': 'Exercise Days/Week',
    'গড়ে আপনি কত ঘণ্টা ইলেকট্রনিক ডিভাইস ব্যবহার করেন, যেমন স্মার্টফোন, ল্যাপটপ, টেলিভিশন, ইত্যাদি? ': 'Device Usage (hrs/day)',
    'গড়ে আপনি ঘুমানোর আগে ফোনে / ল্যপটপে কতটুকু সময় অতিবাহিত করেন?': 'Screen Time Before Sleep',
    '(ঐচ্ছিক) আপনার উচ্চতা কত? (সে.মি. এককে)': 'Height (cm)',
    '(ঐচ্ছিক) আপনার ওজন কত? (কেজি তে)': 'Weight (kg)',
    'রাতে সাধারনত কখন ঘুমাতে যান?': 'Bedtime',
    'সাধারনত আপনি সকালে কখন ঘুম থেকে উঠেন?': 'Wake-up Time',
    'সাধারনত শোয়ার কতক্ষনের মধ্যে আপনার ঘুম চলে আসে?': 'Sleep Onset Time',
    'আপনি কি দিনের বেলা সামান্য ঘুমিয়ে নেন? যদি উত্তর হ্যা হয়, তাহলে তা কতক্ষন?': 'Nap Duration',
    'সব মিলিয়ে গড়ে আপনি ২৪ ঘণ্টার মধ্যে কত ঘণ্টা ঘুমান?': 'Sleep Duration (hrs/24hr)',
    'আপনার সামগ্রীক ঘুমের মান কেমন থাকে? ': 'Sleep Quality',
    'আপনি কি পরিমান ঘুমের মধ্য বিঘ্নতা পরিলক্ষন করেন, যেমন রাতে ঘুম ভেঙ্গে যাওয়া বা খুব হালকা ঘুম?': 'Sleep Disturbances',
    'আপনি কি ঘুমের জন্য কোন ঔষধ সেবন করেন?': 'Sleep Medication',
}

# Rename the columns
dfs['Bengali'].rename(columns=new_column_names, inplace=True)

# Show the updated DataFrame
dfs['Bengali'].head()

Unnamed: 0,Timestamp,Age Group,Gender,Education Level,Occupation,Exercise Days/Week,Device Usage (hrs/day),Screen Time Before Sleep,Height (cm),Weight (kg),Bedtime,Wake-up Time,Sleep Onset Time,Nap Duration,Sleep Duration (hrs/24hr),Sleep Quality,Sleep Disturbances,Sleep Medication
0,2023/11/07 3:26:01 PM GMT+9,২৫-৩৪,মহিলা,মাস্টার্স ডিগ্রি,অন্যান্য:,১-২ দিন,১-৩ ঘণ্টা,৩০ মিনিট - ১ ঘণ্টা,152.4,47.0,23:00,06:30,৬০ মিনিটের বেশি,"হ্যাঁ, ৩০ মিনিটের কম",৪-৬ ঘণ্টা,2,প্রায় সময়ই,না
1,2023/11/07 3:45:02 PM GMT+9,২৫-৩৪,মহিলা,মাস্টার্স ডিগ্রি,অন্যান্য:,৩-৪ দিন,৪-৬ ঘণ্টা,৩০ মিনিটের কম,,,00:00,08:30,১৫-৩০ মিনিট,"হ্যাঁ, ৩০ মিনিটের কম",৬ ঘণ্টা এবং তার বেশি,4,মাঝে মধ্যেই,না
2,2023/11/07 3:51:38 PM GMT+9,২৫-৩৪,মহিলা,ব্যাচলার ডিগ্রি,অন্যান্য:,০ দিন,৭ অথবা আরও ঘণ্টা,২ ঘণ্টা অথবা তার বেশি,153.4,68.0,00:30,09:30,১৫ মিনিটের কম,"না, আমি দিনে ঘুমায় না",৪-৬ ঘণ্টা,3,প্রায় সময়ই,না
3,2023/11/07 3:53:05 PM GMT+9,২৫-৩৪,মহিলা,ডক্টরেট বা পেশাদার ডিগ্রি,পেশাদার / অফিস কর্মী,৫ বা আরও বেশি,৭ অথবা আরও ঘণ্টা,৩০ মিনিট - ১ ঘণ্টা,154.0,59.2,22:50,06:20,১৫-৩০ মিনিট,"না, আমি দিনে ঘুমায় না",৬ ঘণ্টা এবং তার বেশি,3,হঠাৎ হঠাৎ,না
4,2023/11/07 4:05:44 PM GMT+9,২৫-৩৪,পুরুষ,মাস্টার্স ডিগ্রি,ছাত্র,৫ বা আরও বেশি,১-৩ ঘণ্টা,৩০ মিনিটের কম,171.0,75.0,22:00,06:00,১৫-৩০ মিনিট,"না, আমি দিনে ঘুমায় না",৬ ঘণ্টা এবং তার বেশি,4,হঠাৎ হঠাৎ,না


### Translate Cell values to English

In [381]:
bengali_translation_dict = {
    "Gender": {"পুরুষ": "Male", "মহিলা": "Female", "অন্যান্য": "Other"},
    "Age Group": {
        "১৬-২৪": "16-24",
        "২৫-৩৪": "25-34",
        "৩৫-৪৪": "35-44",
        "৪৫-৫৪": "45-54",
        "৫৫+": "55+",
        "অন্যান্য": "Other",
    },
    "Education Level": {
        "মাস্টার্স ডিগ্রি": "Master's",
        "ব্যাচলার ডিগ্রি": "Bachelor's",
        "ডক্টরেট বা পেশাদার ডিগ্রি": "Doctorate/Prof.",
        "উচ্চ মাধ্যমিক": "High School",
    },
    "Occupation": {
        "অন্যান্য:": "Other",
        "পেশাদার / অফিস কর্মী": "Professional/Office Worker",
        "ছাত্র": "Student",
        "বেকার": "Unemployed",
        "পরিষেবা (বিক্রয়, খাদ্য পরিষেবা ইত্যাদি)": "Service",
    },
    "Exercise Days/Week": {
        "১-২ দিন": "1-2 Days",
        "৩-৪ দিন": "3-4 Days",
        "০ দিন": "0 Days",
        "৫ বা আরও বেশি": "5+ Days",
    },
    "Device Usage (hrs/day)": {
        "১-৩ ঘণ্টা": "1-3 Hours",
        "৪-৬ ঘণ্টা": "4-6 Hours",
        "৭ অথবা আরও ঘণ্টা": "7+ Hours",
    },
    "Screen Time Before Sleep": {
        "৩০ মিনিট - ১ ঘণ্টা": "30-60 Minutes",
        "৩০ মিনিটের কম": "<30 Minutes",
        "২ ঘণ্টা অথবা তার বেশি": "2+ Hours",
        "১-২ ঘণ্টা": "1-2 Hours",
    },
    "Sleep Onset Time": {
        "৬০ মিনিটের বেশি": ">60 Minutes",
        "১৫-৩০ মিনিট": "15-30 Minutes",
        "১৫ মিনিটের কম": "<15 Minutes",
        "৩০-৬০ মিনিট": "30-60 Minutes",
    },
    "Nap Duration": {
        "হ্যাঁ, ৩০ মিনিটের কম": "Yes, <30 mins",
        "না, আমি দিনে ঘুমায় না": "No Nap",
        "হ্যাঁ, ৩০-৬০ মিনিট": "Yes, 30-60 mins",
        "হ্যাঁ, ৯০ মিনিটের বেশি": "Yes, >90 mins",
    },
    "Sleep Duration (hrs/24hr)": {
        "৪-৬ ঘণ্টা": "4-6 Hours",
        "৬ ঘণ্টা এবং তার বেশি": "6+ Hours",
    },
    "Sleep Disturbances": {
        "প্রায় সময়ই": "Often",
        "মাঝে মধ্যেই": "Sometimes",
        "হঠাৎ হঠাৎ": "Rarely",
        "কখনও না": "Never",
    },
    "Sleep Medication": {"না": "No", "হ্যাঁ": "Yes"},
}


utils.translate_cells(dfs["Bengali"], bengali_translation_dict)
dfs["Bengali"]["Language"] = "Bengali"
dfs["Bengali"].sample(5)

Unnamed: 0,Timestamp,Age Group,Gender,Education Level,Occupation,Exercise Days/Week,Device Usage (hrs/day),Screen Time Before Sleep,Height (cm),Weight (kg),Bedtime,Wake-up Time,Sleep Onset Time,Nap Duration,Sleep Duration (hrs/24hr),Sleep Quality,Sleep Disturbances,Sleep Medication,Language
4,2023/11/07 4:05:44 PM GMT+9,25-34,Male,Master's,Student,5+ Days,1-3 Hours,<30 Minutes,171.0,75.0,22:00,06:00,15-30 Minutes,No Nap,6+ Hours,4,Rarely,No,Bengali
12,2023/11/07 6:19:23 PM GMT+9,45-54,Female,High School,Student,1-2 Days,4-6 Hours,30-60 Minutes,,43.0,00:00,06:00,15-30 Minutes,"Yes, 30-60 mins",4-6 Hours,3,Never,No,Bengali
9,2023/11/07 5:49:23 PM GMT+9,25-34,Female,Master's,Student,0 Days,1-3 Hours,<30 Minutes,160.0,53.0,12:00,07:00,15-30 Minutes,"Yes, 30-60 mins",6+ Hours,3,Rarely,No,Bengali
1,2023/11/07 3:45:02 PM GMT+9,25-34,Female,Master's,Other,3-4 Days,4-6 Hours,<30 Minutes,,,00:00,08:30,15-30 Minutes,"Yes, <30 mins",6+ Hours,4,Sometimes,No,Bengali
0,2023/11/07 3:26:01 PM GMT+9,25-34,Female,Master's,Other,1-2 Days,1-3 Hours,30-60 Minutes,152.4,47.0,23:00,06:30,>60 Minutes,"Yes, <30 mins",4-6 Hours,2,Often,No,Bengali


Now, the Bengali survey data is completely translated. Now, we'll do the same thing other the remaining languages.

## Vietnamese

### Before

In [382]:
dfs['Vietnamese'].sample(5)

Unnamed: 0,Timestamp,Bạn bao nhiêu tuổi?,Giới tính của bạn?,Trình độ học vấn cao nhất của bạn?,Nghề nghiệp của bạn?,Trung bình mỗi tuần bạn tập thể dục ít nhất 30 phút (hoặc dưới 30p nhưng cường độ cao) bao nhiêu ngày?,"Trung bình mỗi ngày bạn dùng các thiết bị điện tử như điện thoại thông minh, máy tính xách tay, ti vi, v.v. bao nhiêu giờ?",Trung bình bạn dành bao nhiêu thời gian sử dụng điện thoại/máy tính trước khi đi ngủ?,(Không bắt buộc) Chiều cao của bạn là bao nhiêu cm?,(Không bắt buộc) Cân nặng của bạn là bao nhiêu kg?,Thông thường bạn đi ngủ vào lúc mấy giờ tối?,Thông thường bạn thức dậy vào lúc mấy giờ sáng?,Thông thường bạn mất khoảng bao lâu để đi vào giấc ngủ đêm?,Thông thường giấc ngủ trưa của bạn kéo dài bao lâu?,Trung bình mỗi ngày bạn ngủ bao nhiêu giờ trong một ngày (24 giờ)?,Bạn đánh giá chất lượng giấc ngủ chung của mình như thế nào?,"Giấc ngủ của bạn có hay bị gián đoạn (vd: thức giấc nửa đêm, ngủ không yên) khong?",Bạn có sử dụng bất kỳ loại thuốc nào để hỗ trợ giấc ngủ không?
7,2023/11/07 1:28:45 PM GMT+9,25-34,Nữ,Tiến sĩ,Học sinh / Sinh viên,5 ngày trở lên,7 giờ trở lên,1-2 giờ,160.0,60.0,02:00,08:30,15-30 phút,Tôi không ngủ trưa,4-6 giờ,3,Thỉnh thoảng,Không
5,2023/11/07 12:07:55 PM GMT+9,16-24,Nam,THPT,Học sinh / Sinh viên,5 ngày trở lên,7 giờ trở lên,Ít hơn 30 phút,,,23:00,07:00,15-30 phút,30-60 phút,Hơn 6 giờ,4,Hiếm khi,Không
1,2023/11/07 11:42:16 AM GMT+9,16-24,Nam,Cử nhân,Học sinh / Sinh viên,1-2 ngày,7 giờ trở lên,Hơn 2 giờ,175.0,65.0,12:15,08:00,30-60 phút,60-90 phút,Hơn 6 giờ,4,Thỉnh thoảng,Không
13,2023/11/07 7:08:39 PM GMT+9,25-34,Nữ,Cử nhân,Học sinh / Sinh viên,1-2 ngày,4-6 giờ,1-2 giờ,155.0,44.0,12:00,08:00,15-30 phút,Tôi không ngủ trưa,Hơn 6 giờ,3,Thỉnh thoảng,Không
0,2023/11/07 11:38:56 AM GMT+9,25-34,Nữ,Thạc sĩ,Chuyên nghiệp/văn phòng,3-4 ngày,4-6 giờ,1-2 giờ,,,11:00,06:00,Ít hơn 15 phút,Tôi không ngủ trưa,Hơn 6 giờ,4,Hiếm khi,Không


### After

In [383]:
# Translate columns to English
vi_headers_dict = utils.read_json('translation/vi_header.json')

dfs['Vietnamese'].rename(columns=vi_headers_dict, inplace=True)

# Translate cell values to English
vi_cells_dict = utils.read_json('translation/vi_val.json')
utils.translate_cells(dfs["Vietnamese"], vi_cells_dict)

dfs["Vietnamese"]["Language"] = "Vietnamese"
dfs['Vietnamese'].sample(5)

Unnamed: 0,Timestamp,Age Group,Gender,Education Level,Occupation,Exercise Days/Week,Device Usage (hrs/day),Screen Time Before Sleep,Height (cm),Weight (kg),Bedtime,Wake-up Time,Sleep Onset Time,Nap Duration,Sleep Duration (hrs/24hr),Sleep Quality,Sleep Disturbances,Sleep Medication,Language
5,2023/11/07 12:07:55 PM GMT+9,16-24,Male,High School,Student,5+ Days,7+ Hours,<30 Minutes,,,23:00,07:00,15-30 Minutes,30-60 Minutes,6+ Hours,4,Rarely,No,Vietnamese
2,2023/11/07 11:48:19 AM GMT+9,25-34,Female,Master's,Teacher,0 Days,4-6 Hours,1-2 Hours,160.0,53.0,01:00,06:00,15-30 Minutes,No Nap,4-6 Hours,5,Never,No,Vietnamese
12,2023/11/07 5:18:07 PM GMT+9,25-34,Female,Doctorate,Student,1-2 Days,4-6 Hours,<30 Minutes,,,21:30,06:30,15-30 Minutes,No Nap,6+ Hours,3,Rarely,No,Vietnamese
9,2023/11/07 2:22:13 PM GMT+9,25-34,Female,Master's,Student,0 Days,1-3 Hours,30-60 Minutes,155.0,,02:00,09:00,<15 Minutes,No Nap,6+ Hours,4,Never,No,Vietnamese
15,2023/11/08 9:06:41 AM GMT+9,25-34,Female,Doctorate,Researcher,0 Days,4-6 Hours,30-60 Minutes,154.0,44.0,01:00,08:00,15-30 Minutes,No Nap,4-6 Hours,2,Sometimes,No,Vietnamese


## English

### Before

In [384]:
dfs["English"].sample(5)

Unnamed: 0,Timestamp,What is your age group?,What is your gender?,What is the highest level of education you have completed?,Which category best describes your occupation?,"On average, how many days per week do you exercise for at least 30 (or under 30 mins but high intensity) minutes?","On average, how many hours per day do you use electronic devices such as smartphones, laptops, televisions, etc.?","On average, how much time do you usually spend on phone / computers before sleep?",(Optional) What is your height in centimeters?,(Optional) What is your weight in kilograms?,"On average, what time do you typically go to bed at night?","On average, what time do you typically wake up in the morning?","On average, how long does it take you to fall asleep at night?","On average, how long is your typical daytime nap?","On average, how many hours do you sleep per 24-hour period?",How would you rate your overall sleep quality?,How often do you experience sleep disturbances such as waking up during the night or having restless sleep?,Do you take any medication to help you sleep?
6,2023/11/07 5:12:22 PM GMT+9,25-34,Male,Bachelor's degree,Student,3-4 days,7 or more hours,More than 2 hours,167.0,60.0,11:00,07:00,15-30 minutes,"No, I do not nap during the day",More than 6 hours,4,Rarely,No
7,2023/11/07 5:12:34 PM GMT+9,25-34,Male,Bachelor's degree,Student,5 or more days,4-6 hours,Less than 30 minutes,172.0,71.0,23:30,06:30,15-30 minutes,"Yes, less than 30 minutes",More than 6 hours,3,Rarely,No
67,2023/11/13 7:10:06 PM GMT+9,25-34,Male,Bachelor's degree,Professional/office job,1-2 days,4-6 hours,30 minutes - 1 hour,178.0,70.0,00:00,07:00,Less than 15 minutes,"No, I do not nap during the day",4-6 hours,4,Rarely,No
15,2023/11/07 6:59:34 PM GMT+9,25-34,Male,Bachelor's degree,Student,3-4 days,4-6 hours,Less than 30 minutes,170.74,67.0,23:30,06:00,Less than 15 minutes,"No, I do not nap during the day",More than 6 hours,4,Rarely,No
48,2023/11/09 11:15:45 AM GMT+9,35-44,Male,Bachelor's degree,Professional/office job,1-2 days,1-3 hours,30 minutes - 1 hour,167.0,80.0,10:30,05:00,15-30 minutes,"Yes, less than 30 minutes",4-6 hours,4,Rarely,No


### After

Even though the original survey data is in English, there are two problems:
- Column headers contain long text, which decreases readablity.
- Cell values also include long text, and unstandardized.

In [385]:
# Shorten column header texts
en_headers_dict = utils.read_json('translation/en_header.json')
dfs['English'].rename(columns=en_headers_dict, inplace=True)

# Translate cell values to English
en_cells_dict = utils.read_json('translation/en_val.json')
utils.translate_cells(dfs["English"], en_cells_dict)
dfs["English"]["Language"] = "English"
dfs["English"].sample(5)


Unnamed: 0,Timestamp,Age Group,Gender,Education Level,Occupation,Exercise Days/Week,Device Usage (hrs/day),Screen Time Before Sleep,Height (cm),Weight (kg),Bedtime,Wake-up Time,Sleep Onset Time,Nap Duration,Sleep Duration (hrs/24hr),Sleep Quality,Sleep Disturbances,Sleep Medication,Language
51,2023/11/09 12:00:26 PM GMT+9,35-44,Male,Bachelor's,Professional/Office Worker,0 Days,1-3 Hours,<30 Minutes,,70.0,20:30,05:00,<15 Minutes,30-60 mins,6+ Hours,4,Never,No,English
54,2023/11/09 1:18:51 PM GMT+9,35-44,Female,Master's,Professional/Office Worker,1-2 Days,4-6 Hours,2+ Hours,150.0,70.0,22:00,06:00,15-30 Minutes,<30 mins,6+ Hours,5,Rarely,No,English
59,2023/11/09 1:48:48 PM GMT+9,25-34,Male,Doctorate,Professional/Office Worker,5+ Days,4-6 Hours,1-2 Hours,180.0,85.0,23:00,06:30,<15 Minutes,No Nap,6+ Hours,4,Rarely,No,English
55,2023/11/09 1:41:14 PM GMT+9,35-44,Male,Master's,Student,0 Days,7+ Hours,30-60 Minutes,,72.0,13:30,07:30,15-30 Minutes,30-60 mins,4-6 Hours,2,Frequently,No,English
37,2023/11/08 7:00:15 PM GMT+9,45-54,Male,Doctorate,Professional/Office Worker,3-4 Days,7+ Hours,2+ Hours,180.0,83.0,23:30,07:00,15-30 Minutes,No Nap,6+ Hours,2,Frequently,No,English


## Korean

In [386]:
# Shorten column header texts
kr_headers_dict = utils.read_json('translation/kr_header.json')
dfs['Korean'].rename(columns=kr_headers_dict, inplace=True)

# Translate cell values to English
kr_cells_dict = utils.read_json('translation/kr_val.json')
utils.translate_cells(dfs["Korean"], kr_cells_dict)
dfs["Korean"]["Language"] = "Korean"
dfs["Korean"]

Unnamed: 0,Timestamp,Age Group,Gender,Education Level,Occupation,Exercise Days/Week,Device Usage (hrs/day),Screen Time Before Sleep,Height (cm),Weight (kg),Bedtime,Wake-up Time,Sleep Onset Time,Nap Duration,Sleep Duration (hrs/24hr),Sleep Quality,Sleep Disturbances,Sleep Medication,Language
0,2023/11/07 11:29:18 AM GMT+9,16-24,Female,High School,Student,1-2 Days,7+ Hours,1-2 Hours,167,60,01:00,08:30,<15 Minutes,No Nap,6+ Hours,,Sometimes,No,Korean


# Merge

In [388]:
df_merge = pd.concat(dfs.values())
df_merge.to_csv('data/all.csv')
df_merge.sample(10)

Unnamed: 0,Timestamp,Age Group,Gender,Education Level,Occupation,Exercise Days/Week,Device Usage (hrs/day),Screen Time Before Sleep,Height (cm),Weight (kg),Bedtime,Wake-up Time,Sleep Onset Time,Nap Duration,Sleep Duration (hrs/24hr),Sleep Quality,Sleep Disturbances,Sleep Medication,Language
14,2023/11/07 6:19:46 PM GMT+9,25-34,Male,Master's,Professional/Office Worker,3-4 Days,1-3 Hours,1-2 Hours,158.0,64.0,23:30,06:00,15-30 Minutes,No Nap,4-6 Hours,3,Sometimes,No,English
2,2023/11/07 11:48:19 AM GMT+9,25-34,Female,Master's,Teacher,0 Days,4-6 Hours,1-2 Hours,160.0,53.0,01:00,06:00,15-30 Minutes,No Nap,4-6 Hours,5,Never,No,Vietnamese
50,2023/11/09 11:58:24 AM GMT+9,16-24,Male,High School,Student,3-4 Days,1-3 Hours,30-60 Minutes,174.0,79.0,01:00,07:00,<15 Minutes,30-60 mins,4-6 Hours,2,Rarely,No,English
21,2023/11/07 9:47:14 PM GMT+9,25-34,Male,Bachelor's,Student,5+ Days,7+ Hours,2+ Hours,167.5,66.0,04:00,23:00,30-60 Minutes,<30 mins,6+ Hours,2,Frequently,Yes,English
49,2023/11/09 11:56:29 AM GMT+9,25-34,Female,Bachelor's,Professional/Office Worker,1-2 Days,7+ Hours,30-60 Minutes,160.0,99.0,09:30,04:50,15-30 Minutes,60-90 mins,4-6 Hours,4,Sometimes,No,English
34,2023/11/08 1:49:11 PM GMT+9,25-34,Female,Master's,Professional/Office Worker,1-2 Days,7+ Hours,2+ Hours,170.0,65.0,22:00,05:00,15-30 Minutes,No Nap,4-6 Hours,4,Rarely,No,English
14,2023/11/08 12:34:45 AM GMT+9,25-34,Female,Bachelor's,Professional/Office Worker,1-2 Days,7+ Hours,30-60 Minutes,150.0,45.0,00:00,07:30,30-60 Minutes,<30 Minutes,6+ Hours,3,Rarely,No,Vietnamese
22,2023/11/07 10:06:46 PM GMT+9,25-34,Male,Master's,Professional/Office Worker,1-2 Days,1-3 Hours,30-60 Minutes,170.0,100.0,11:00,07:00,30-60 Minutes,<30 mins,6+ Hours,4,Sometimes,No,English
10,2023/11/07 4:45:50 PM GMT+9,25-34,Female,Doctorate,Professional/Office Worker,1-2 Days,4-6 Hours,1-2 Hours,152.0,,00:00,08:30,15-30 Minutes,No Nap,6+ Hours,4,Rarely,No,Vietnamese
41,2023/11/08 8:23:26 PM GMT+9,25-34,Male,Master's,Professional/Office Worker,3-4 Days,7+ Hours,<30 Minutes,185.0,86.0,23:00,07:30,<15 Minutes,No Nap,6+ Hours,5,Never,No,English
