# Importing libraries and reading the data

In [None]:
import pandas as pd
import numpy as np
import datetime

In [None]:
# Load the raw survey export into `df`; this is the frame later passed to preprocessor().
# NOTE(review): the path is hard-coded and absolute (looks like a Colab upload location) —
# confirm or adjust it before running in another environment.

df = pd.read_csv('/content/4_results-survey821695.csv')

In [None]:
# Preview the first rows. head must be *called*; a bare `df.head` only displays
# the bound-method repr instead of the five-row preview.
df.head()

<bound method NDFrame.head of     id GENDER  GENDER[other]   AGE NATIVETR BILINGUALQ  REGION[THRACE]  \
0    1  FEMAL            NaN  2000    YESTR      MONOL             NaN   
1    2  FEMAL            NaN  1996    YESTR       BIL1             NaN   
2    3  FEMAL            NaN  1999    YESTR      MONOL             NaN   
3    5  FEMAL            NaN  2003    YESTR      MONOL             NaN   
4    7   MALE            NaN  2001    YESTR      MONOL             NaN   
5    8   MALE            NaN  1994    YESTR      MONOL             NaN   
6    9  FEMAL            NaN  1996    YESTR       BIL3             NaN   
7   10  FEMAL            NaN  1998    YESTR       BIL1             NaN   
8   12  FEMAL            NaN  2000    YESTR       BIL1             NaN   
9   13  FEMAL            NaN  1974    YESTR      MONOL             NaN   
10  15  FEMAL            NaN  1971    YESTR      MONOL             NaN   
11  17   MALE            NaN  2001    YESTR      MONOL             NaN   
12  18  

# Basic preprocessing functions

The functions below perform these cleaning operations:


*   Clean the name of the columns
*   Remove irrelevant columns (seed, token etc.)
*   Remove non-native speakers
*   Reduce region columns into one with categorical data
*   Calculate age (based on birth year)
*   Compile all these cleaning operations







In [None]:
def col_clean(df, strings_to_eliminate, string_to_remove_from_column_names):
    """Drop unwanted columns and strip a substring from the remaining names.

    Args:
        df: Input DataFrame; it is not modified.
        strings_to_eliminate: Substrings; any column whose name contains at
            least one of them is removed.
        string_to_remove_from_column_names: Substring deleted from every
            surviving column name.

    Returns:
        A new DataFrame with the filtered and renamed columns.
    """
    def is_wanted(name):
        # A column survives only if none of the unwanted markers occurs in its name
        return not any(marker in name for marker in strings_to_eliminate)

    kept = list(filter(is_wanted, df.columns))
    cleaned = df.copy()[kept]
    cleaned.columns = [
        name.replace(string_to_remove_from_column_names, '')
        for name in cleaned.columns
    ]
    return cleaned

In [None]:
def filter_nonnative(df):
  """Keep only the rows whose "NATIVETR" value is exactly "YESTR".

  A boolean mask is used instead of dropping rows one label at a time: it is
  a single pass over the column and, unlike label-based drop(), it does not
  discard unrelated rows that happen to share an index label with a rejected
  row.

  Args:
    df: DataFrame with a "NATIVETR" column; it is not modified.

  Returns:
    A new DataFrame (a copy) containing only the "YESTR" rows, with their
    original index labels preserved.
  """
  # Missing values compare unequal to "YESTR", so those rows are removed as well
  is_native = df["NATIVETR"] == "YESTR"
  return df[is_native].copy()

In [None]:
def assign_region(df):
  """Collapse the REGION[X] indicator columns into one "Region" column.

  For every row, "Region" receives the X of the first column (in column
  order) whose name contains "REGION" and whose value is "Y"; rows with no
  "Y" get an empty string. All columns whose name contains "REGION" are then
  removed.

  Args:
    df: Input DataFrame; it is not modified.

  Returns:
    A new DataFrame with the "Region" column inserted at position 6 (or at
    the end when the frame has fewer than 6 columns) and the REGION columns
    dropped.
  """
  df_copy = df.copy()
  region_columns = [col for col in df_copy.columns if "REGION" in col]

  # Work on positional arrays so the result does not depend on index labels
  region = np.full(len(df_copy), "", dtype=object)
  unassigned = np.ones(len(df_copy), dtype=bool)
  for col in region_columns:
    selected = (df_copy[col] == "Y").to_numpy() & unassigned
    if selected.any():
      # "REGION[MARMARA]" -> "MARMARA"
      region[selected] = col.split("[")[1].split("]")[0]
      # The first matching column wins; later columns must not overwrite it
      unassigned &= ~selected

  # Clamp the position so frames with fewer than 6 columns do not raise IndexError
  df_copy.insert(min(6, len(df_copy.columns)), "Region", region)

  return df_copy.drop(columns=region_columns)

In [None]:
def calculate_age(df, current_year=None):
  """Replace the birth years stored in "AGE" with ages in years.

  Args:
    df: DataFrame whose "AGE" column holds birth years; it is not modified.
    current_year: Year the ages are computed against. Defaults to the
      calendar year at call time; pass a fixed value to make the result
      reproducible.

  Returns:
    A new DataFrame in which "AGE" equals current_year minus the birth year.
  """
  if current_year is None:
    current_year = datetime.datetime.now().year

  df_copy = df.copy()
  # One vectorised subtraction instead of a per-row loop with label-based writes
  df_copy["AGE"] = current_year - df_copy["AGE"]
  return df_copy

In [None]:
def assign_major(row):
    """Map a row's EDUCATIONMAJOR[...] indicators to a two-letter major code.

    The indicators are checked in a fixed order and the first one equal to
    'Y' decides the result.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexed by column name.

    Returns:
        'SS', 'PS', 'HU' or 'FA', or None when no indicator equals 'Y'.
    """
    # Ordered (column, code) pairs; order matters when several indicators are 'Y'
    major_codes = (
        ('EDUCATIONMAJOR[SOCIALSCIENCES]', 'SS'),
        ('EDUCATIONMAJOR[PHYSICALSCIENCES]', 'PS'),
        ('EDUCATIONMAJOR[HUMANITIES]', 'HU'),
        ('EDUCATIONMAJOR[FINEARTS]', 'FA'),
    )
    for column, code in major_codes:
        if row[column] == 'Y':
            return code
    return None

In [None]:
def preprocessor(df):
  """Run the basic cleaning pipeline on a raw survey DataFrame.

  Steps, in order: col_clean (remove the listed bookkeeping columns and strip
  '[PRACTICE1]' from column names), filter_nonnative, assign_region and
  calculate_age.

  Args:
    df: Raw survey DataFrame; it is not modified.

  Returns:
    The DataFrame produced by the last step of the pipeline.
  """
  # Any column whose name contains one of these substrings is discarded
  eliminate_column_names = ['Time', 'INFO', 'DISPLAY', 'startlanguage', 'submitdate', 'lastpage', 'seed', 'token']

  cleaned = col_clean(df.copy(),
                      strings_to_eliminate = eliminate_column_names,
                      string_to_remove_from_column_names = '[PRACTICE1]')
  natives_only = filter_nonnative(cleaned)
  with_region = assign_region(natives_only)
  return calculate_age(with_region)

# **More sophisticated operations**

*Here I check each participant's ratings of the control sentences, as well as their response time, to decide whether that participant's data should be excluded from the experiment.*

## Control Sentences

In [None]:
def ctungram_check(df, threshold):
  """Flag rows by the mean of their "CTUNGRAM" columns.

  Adds two columns to a copy of the input:
    * "ctungram_mean": the row-wise mean over every column whose name
      contains "CTUNGRAM" (float).
    * "ctungram_check": 0 when that mean is greater than `threshold`,
      otherwise 1.

  When no column name contains "CTUNGRAM", both new columns are filled
  with 0.

  Args:
    df: Input DataFrame; it is not modified.
    threshold: Value the row mean is compared against.

  Returns:
    A new DataFrame with the two extra columns.
  """
  new_df = df.copy()
  control_columns = [col for col in new_df.columns if "CTUNGRAM" in col]

  if not control_columns:
    # Nothing to average: keep the neutral defaults
    new_df["ctungram_check"] = 0
    new_df["ctungram_mean"] = 0.0
    return new_df

  ratings = new_df[control_columns].to_numpy(dtype=float)
  # Plain sum / count, so a missing rating makes the whole row mean NaN
  row_means = ratings.sum(axis=1) / ratings.shape[1]

  # NOTE: a NaN mean is not greater than the threshold, so such rows get 1
  new_df["ctungram_check"] = np.where(row_means > threshold, 0, 1)
  # Written as a float column in one step; no int column is upcast cell by cell
  new_df["ctungram_mean"] = row_means

  return new_df

In [None]:
def ctgram_check(df, threshold):
  """Flag rows by the mean of their "CTGRAM" columns.

  Adds two columns to a copy of the input:
    * "ctgram_mean": the row-wise mean over every column whose name contains
      "CTGRAM" (float).
    * "ctgram_check": 0 when that mean is lower than `threshold`,
      otherwise 1.

  When no column name contains "CTGRAM", both new columns are filled with 0
  instead of raising ZeroDivisionError (same convention as ctungram_check).

  Args:
    df: Input DataFrame; it is not modified.
    threshold: Value the row mean is compared against.

  Returns:
    A new DataFrame with the two extra columns.
  """
  new_df = df.copy()
  control_columns = [col for col in new_df.columns if "CTGRAM" in col]

  if not control_columns:
    # Nothing to average: keep the neutral defaults
    new_df["ctgram_check"] = 0
    new_df["ctgram_mean"] = 0.0
    return new_df

  ratings = new_df[control_columns].to_numpy(dtype=float)
  # Plain sum / count, so a missing rating makes the whole row mean NaN
  row_means = ratings.sum(axis=1) / ratings.shape[1]

  # NOTE: a NaN mean is not lower than the threshold, so such rows get 1
  new_df["ctgram_check"] = np.where(row_means < threshold, 0, 1)
  # Written as a float column in one step; no int column is upcast cell by cell
  new_df["ctgram_mean"] = row_means

  return new_df

## Compiling basic statistics

The following function computes basic statistics (mean, median, standard deviation, variance) for each participant over the columns that belong to a given condition.

In [None]:
def calculate_statistics_for_columns_with_string(df, string_to_match):
    """Add row-wise statistics over the columns whose name contains a string.

    The matching columns are selected once, from the input frame, so the
    statistic columns added here can never be picked up as inputs while the
    computation is running.

    Args:
        df: Input DataFrame; it is not modified. The matching columns must
            be numeric.
        string_to_match: Substring identifying the columns to aggregate.

    Returns:
        A copy of `df` with four extra columns: 'mean', 'median', 'stddev'
        and 'variance'. 'stddev' and 'variance' are population statistics
        (NumPy default, ddof=0) and are NaN when at most one column
        matches; every statistic is NaN when no column matches.
    """
    output_df = df.copy()
    # Positional selection, fixed before any statistic column exists
    indices = [position for position, column in enumerate(df.columns) if string_to_match in column]
    n_rows = len(output_df)

    if indices:
        data_to_analyze = output_df.iloc[:, indices].to_numpy(dtype=float)
        mean = data_to_analyze.mean(axis=1)
        median = np.median(data_to_analyze, axis=1)
    else:
        mean = np.full(n_rows, np.nan)
        median = np.full(n_rows, np.nan)

    if len(indices) <= 1:
        # Spread is reported as undefined for zero or one value
        stddev = np.full(n_rows, np.nan)
        variance = np.full(n_rows, np.nan)
    else:
        stddev = data_to_analyze.std(axis=1)
        variance = data_to_analyze.var(axis=1)

    # Whole-column assignment is positional, so duplicate index labels are safe
    output_df['mean'] = mean
    output_df['median'] = median
    output_df['stddev'] = stddev
    output_df['variance'] = variance

    return output_df

# Execution

This step runs all the functions above to produce the final, cleaned dataset for further statistical analysis.

In [None]:
# Basic preprocessing functions
# Run the whole cleaning pipeline (column clean-up, native-speaker filter,
# region collapse, age calculation) on the raw frame.

df_preprocessed = preprocessor(df)

In [None]:
# Derive one 'MAJOR' code per row from the EDUCATIONMAJOR[...] indicator columns
df_preprocessed['MAJOR'] = df_preprocessed.apply(assign_major, axis=1)

# insert the 'MAJOR' column as the 9th column in the DataFrame
# (pop removes it from its current position first, so insert does not see a duplicate name)
df_preprocessed.insert(8, 'MAJOR', df_preprocessed.pop('MAJOR'))

In [None]:
# Control-sentence checks:
#   ctgram_check is 0 when the mean of the CTGRAM columns is below 1.5, otherwise 1;
#   ctungram_check is 0 when the mean of the CTUNGRAM columns is above -1.5, otherwise 1.
df_preprocessed = ctgram_check(df_preprocessed, 1.5)
df_preprocessed = ctungram_check(df_preprocessed, -1.5)
df_preprocessed

Unnamed: 0,id,GENDER,GENDER[other],AGE,NATIVETR,BILINGUALQ,Region,L2,MAJOR,EDUCATION,...,UCASEU10,CTUNGRAM10,LCASEM12,CTGRAM11,CTUNGRAM11,interviewtime,ctgram_check,ctgram_mean,ctungram_check,ctungram_mean
0,1,FEMAL,,23,YESTR,MONOL,MARMARA,İngilizce- 2,SS,BA,...,-1,-1,3,2,1,1009.04,1,2.545455,0,-1.363636
1,2,FEMAL,,27,YESTR,BIL1,MARMARA,Zazaca 2\nİngilizce 1,SS,MA,...,1,-3,3,3,2,791.09,1,2.181818,1,-2.090909
2,3,FEMAL,,24,YESTR,MONOL,MARMARA,,SS,MA,...,1,-3,3,3,-3,686.35,1,2.272727,1,-2.454545
3,5,FEMAL,,20,YESTR,MONOL,MARMARA,İngilizce 2,SS,BA,...,-1,3,2,3,3,455.27,1,1.636364,0,1.272727
4,7,MALE,,22,YESTR,MONOL,AEGEAN,ingilizce-2,SS,BA,...,-3,-3,1,3,-1,439.81,1,1.909091,1,-2.181818
5,8,MALE,,29,YESTR,MONOL,AEGEAN,,SS,BA,...,3,-3,3,3,1,886.09,1,1.818182,1,-2.636364
6,9,FEMAL,,27,YESTR,BIL3,CENTRALANATOLIA,,SS,BA,...,2,-1,3,3,-1,574.62,1,1.545455,1,-2.272727
7,10,FEMAL,,25,YESTR,BIL1,MEDITER,Orta derecede ingilizce biliyorum,SS,BA,...,-1,-3,1,2,-1,796.77,0,1.454545,1,-2.454545
8,12,FEMAL,,23,YESTR,BIL1,EASTANATOLIA,,SS,BA,...,-3,-3,1,2,-3,663.89,0,0.363636,1,-2.545455
9,13,FEMAL,,49,YESTR,MONOL,MARMARA,İngilizce 2,PS,BA,...,-3,-1,2,3,-2,422.58,0,1.181818,1,-2.090909


In [None]:
# Recode the 'LISE' education level as 'LYC'. The mask is built from
# df_preprocessed itself (not the raw df) so it always matches the rows being updated.
df_preprocessed.loc[df_preprocessed['EDUCATION'] == 'LISE', 'EDUCATION'] = 'LYC'
# Remove the two "[other]" columns; errors='ignore' keeps this cell re-runnable
# after they have already been dropped.
df_preprocessed = df_preprocessed.drop(columns=['EDUCATION[other]', 'GENDER[other]'], errors='ignore')

In [None]:
# Display the frame to confirm the recode and that the "[other]" columns are gone
df_preprocessed

Unnamed: 0,id,GENDER,AGE,NATIVETR,BILINGUALQ,Region,L2,MAJOR,EDUCATION,EDUCATIONMAJOR[FINEARTS],...,UCASEU10,CTUNGRAM10,LCASEM12,CTGRAM11,CTUNGRAM11,interviewtime,ctgram_check,ctgram_mean,ctungram_check,ctungram_mean
0,1,FEMAL,23,YESTR,MONOL,MARMARA,İngilizce- 2,SS,BA,,...,-1,-1,3,2,1,1009.04,1,2.545455,0,-1.363636
1,2,FEMAL,27,YESTR,BIL1,MARMARA,Zazaca 2\nİngilizce 1,SS,MA,,...,1,-3,3,3,2,791.09,1,2.181818,1,-2.090909
2,3,FEMAL,24,YESTR,MONOL,MARMARA,,SS,MA,,...,1,-3,3,3,-3,686.35,1,2.272727,1,-2.454545
3,5,FEMAL,20,YESTR,MONOL,MARMARA,İngilizce 2,SS,BA,,...,-1,3,2,3,3,455.27,1,1.636364,0,1.272727
4,7,MALE,22,YESTR,MONOL,AEGEAN,ingilizce-2,SS,BA,,...,-3,-3,1,3,-1,439.81,1,1.909091,1,-2.181818
5,8,MALE,29,YESTR,MONOL,AEGEAN,,SS,BA,,...,3,-3,3,3,1,886.09,1,1.818182,1,-2.636364
6,9,FEMAL,27,YESTR,BIL3,CENTRALANATOLIA,,SS,BA,,...,2,-1,3,3,-1,574.62,1,1.545455,1,-2.272727
7,10,FEMAL,25,YESTR,BIL1,MEDITER,Orta derecede ingilizce biliyorum,SS,BA,,...,-1,-3,1,2,-1,796.77,0,1.454545,1,-2.454545
8,12,FEMAL,23,YESTR,BIL1,EASTANATOLIA,,SS,BA,,...,-3,-3,1,2,-3,663.89,0,0.363636,1,-2.545455
9,13,FEMAL,49,YESTR,MONOL,MARMARA,İngilizce 2,PS,BA,,...,-3,-1,2,3,-2,422.58,0,1.181818,1,-2.090909


In [None]:
# Write the cleaned data to CSV (relative to the working directory);
# index=False leaves the DataFrame index out of the file.
df_preprocessed.to_csv('Anket4_preprocessed.csv', index=False)

In [None]:
# Read the exported file back from the same relative path it was written to,
# so the round-trip check does not depend on the working directory being /content.
df_test = pd.read_csv('Anket4_preprocessed.csv')

In [None]:
# Display the re-loaded frame for comparison with df_preprocessed
df_test

Unnamed: 0,id,GENDER,AGE,NATIVETR,BILINGUALQ,Region,L2,MAJOR,EDUCATION,EDUCATIONMAJOR[FINEARTS],...,UCASEU10,CTUNGRAM10,LCASEM12,CTGRAM11,CTUNGRAM11,interviewtime,ctgram_check,ctgram_mean,ctungram_check,ctungram_mean
0,1,FEMAL,23,YESTR,MONOL,MARMARA,İngilizce- 2,SS,BA,,...,-1,-1,3,2,1,1009.04,1,2.545455,0,-1.363636
1,2,FEMAL,27,YESTR,BIL1,MARMARA,Zazaca 2\nİngilizce 1,SS,MA,,...,1,-3,3,3,2,791.09,1,2.181818,1,-2.090909
2,3,FEMAL,24,YESTR,MONOL,MARMARA,,SS,MA,,...,1,-3,3,3,-3,686.35,1,2.272727,1,-2.454545
3,5,FEMAL,20,YESTR,MONOL,MARMARA,İngilizce 2,SS,BA,,...,-1,3,2,3,3,455.27,1,1.636364,0,1.272727
4,7,MALE,22,YESTR,MONOL,AEGEAN,ingilizce-2,SS,BA,,...,-3,-3,1,3,-1,439.81,1,1.909091,1,-2.181818
5,8,MALE,29,YESTR,MONOL,AEGEAN,,SS,BA,,...,3,-3,3,3,1,886.09,1,1.818182,1,-2.636364
6,9,FEMAL,27,YESTR,BIL3,CENTRALANATOLIA,,SS,BA,,...,2,-1,3,3,-1,574.62,1,1.545455,1,-2.272727
7,10,FEMAL,25,YESTR,BIL1,MEDITER,Orta derecede ingilizce biliyorum,SS,BA,,...,-1,-3,1,2,-1,796.77,0,1.454545,1,-2.454545
8,12,FEMAL,23,YESTR,BIL1,EASTANATOLIA,,SS,BA,,...,-3,-3,1,2,-3,663.89,0,0.363636,1,-2.545455
9,13,FEMAL,49,YESTR,MONOL,MARMARA,İngilizce 2,PS,BA,,...,-3,-1,2,3,-2,422.58,0,1.181818,1,-2.090909
