In [36]:
from pathlib import Path

import pandas as pd

# Project root, used below to build the paths to the data files.
# Assumes the notebook's working directory sits directly under the project root,
# i.e. the parent of the current working directory contains the "data" folder.
ROOT_DIR = Path.cwd().parent

In [37]:
# Let pandas display up to 85 columns and 85 rows before truncating a frame.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 85)

In [38]:
# Load the 2019 Stack Overflow survey: the responses themselves and the schema
# file (one row per survey column, with the question text behind it).
survey_dir = ROOT_DIR / "data" / "stackoverflow_survey" / "2019"
df = pd.read_csv(survey_dir / "survey_results_public.csv")
schema_df = pd.read_csv(survey_dir / "survey_results_schema.csv")

In [None]:
# head() shows the first 5 rows by default; pass an int to change how many, e.g. df.head(10).
df.head() # first 5 rows

In [None]:
df.tail() # last 5 rows (pass an int to change how many)

In [None]:
# shape is a (number of rows, number of columns) tuple
df.shape

In [None]:
# info() prints the number of rows and columns and, additionally, the datatype of each column
df.info()

In [None]:
# list the column names
df.columns

In [None]:
# the schema pairs each survey column name with the text of the question behind it
schema_df

In [None]:
# select a single column by name (returns a Series)
df["Hobbyist"]

In [None]:
# select several columns by passing a list of names (returns a DataFrame)
df[["Hobbyist", "MainBranch"]]

In [None]:
# iloc selects by integer position: position 0 is the first row
df.iloc[0]

In [None]:
# pass a list of positions to get several rows
df.iloc[[0,1]]

In [None]:
# loc selects by label rather than by position.
# df was loaded without an explicit index, so its labels are the default
# numeric range 0, 1, 2, ... and label 0 is therefore the first row.
df.loc[0]

In [None]:
# pass a list of labels to get several rows
df.loc[[0,1]]

In [None]:
# rows at positions 0 and 1 (the first two rows), restricted to the column
# at position 3 (the fourth column) -- iloc positions are zero-based
df.iloc[[0,1],3]

In [None]:
# with loc, columns must be selected by their label, not by an integer position
df.loc[[0,1], ["OpenSourcer","OpenSource"]]

In [None]:
# the full "Hobbyist" column
df["Hobbyist"]

In [None]:
# a single value: row label 0, column "Hobbyist"
df.loc[0, "Hobbyist"]

In [None]:
# get the first 3 responses in the "Hobbyist" column
df.loc[[0,1,2],"Hobbyist"]

In [None]:
# Slicing with loc
# NOTE: a label slice INCLUDES its end label,
# so 0:2 selects labels 0, 1 and 2 (a normal Python range would stop at 1).
# NOTE: a slice is written without surrounding list brackets.
df.loc[0:2, "Hobbyist"]

In [None]:
# columns can be sliced by label as well; the end column ("Employment") is included too
df.loc[0:2, "Hobbyist":"Employment"]

In [58]:
# Build a DataFrame from a plain dict: each key becomes a column name and
# each list supplies that column's values, row by row.
people = dict(
    first=["Corey", "Jane", "John"],
    last=["Schafer", "Doe", "Doe"],
    email=["CoreySchafer@email.com", "JaneDoe@email.com", "JohnDoe@email.com"],
)
dict_df = pd.DataFrame(data=people)

In [None]:
# display the DataFrame built from the dict
dict_df

In [None]:
# Use the email column as the index of this DataFrame.
# set_index() returns a new DataFrame, so this is only temporary: dict_df itself is unchanged.
# To make it permanent, reassign the result (dict_df = dict_df.set_index("email"))
# or pass inplace=True.
dict_df.set_index("email")

In [None]:
# the index of the returned frame now holds the email values
dict_df.set_index("email").index

In [None]:
# with a meaningful index in place, loc takes those labels instead of the
# default numeric labels 0, 1, 2, etc.
dict_df.set_index("email").loc["JaneDoe@email.com", "first"]

In [63]:
# reset_index() undoes set_index(): the index values move back into a regular
# column and the default 0, 1, 2, ... index is restored. This is how you recover
# if you set an index and mistakenly made it permanent with inplace=True.
#
# dict_df never had its index changed (every set_index() call above returned a
# new frame), so the round trip is shown on a temporary frame. Calling
# dict_df.reset_index(inplace=True) here would instead push the default index
# into a spurious "index" column, add another column on the next re-run of this
# cell, and eventually raise ValueError.
dict_df.set_index("email").reset_index()

In [None]:
# column names of the survey responses; "Respondent" is used as the index in the next cell
df.columns

In [35]:
# The index can be chosen at load time with index_col, instead of calling
# set_index() afterwards: responses are keyed by "Respondent", and the schema
# is keyed by "Column" (the survey column name).
survey_2019_dir = ROOT_DIR / "data" / "stackoverflow_survey" / "2019"
indexed_df = pd.read_csv(
    survey_2019_dir / "survey_results_public.csv",
    index_col="Respondent",
)
indexed_schema_df = pd.read_csv(
    survey_2019_dir / "survey_results_schema.csv",
    index_col="Column",
)

In [None]:
# first 5 rows; the index is now the Respondent column
indexed_df.head()

In [None]:
# loc looks up by index label: this is the row whose Respondent value is 1,
# not the row at position 1
indexed_df.loc[1]

In [None]:
# the schema, indexed by survey column name
indexed_schema_df

In [None]:
# look up the schema row for the "Hobbyist" survey column
indexed_schema_df.loc["Hobbyist"]

In [None]:
# just the question text behind the "MgrIdiot" survey column
indexed_schema_df.loc["MgrIdiot", "QuestionText"]

In [71]:
# sort the rows by index label in ascending order;
# sort_index() returns a new DataFrame, so indexed_schema_df itself is unchanged
indexed_schema_df.sort_index(ascending=True)

Unnamed: 0_level_0,QuestionText
Column,Unnamed: 1_level_1
Age,What is your age (in years)? If you prefer not...
Age1stCode,At what age did you write your first line of c...
BetterLife,Do you think people born today will have a bet...
BlockchainIs,Blockchain / cryptocurrency technology is prim...
BlockchainOrg,How is your organization thinking about or imp...
CareerSat,"Overall, how satisfied are you with your caree..."
CodeRev,Do you review code as part of your work?
CodeRevHrs,"On average, how many hours per week do you spe..."
CompFreq,"Is that compensation weekly, monthly, or yearly?"
CompTotal,What is your current total compensation (salar...
