In [1]:
import pandas as pd

# Equivalent of simple lists
x1 = [list(range(1, 5)), "a", True]
print(x1)

x2 = {'a': list(range(1, 3)), 'b': list(range(1, 4)), 'c': list(range(1, 5))}
print(x2)

# Equivalent of c() in R for combining vectors
combined = [1, 2] + [3, 4]  # This combines the lists into one list
print(combined)

# Nested lists
x3 = [[1, 2], [3, 4]]
x4 = [1, 2] + [3, 4]  # This is just concatenating the lists
x5 = [1, [2, [3, [4, [5]]]]]

# DataFrames and nesting
df = pd.DataFrame({
    'x': [1, 2],
    'y': ["a", "b"],
    'z': [[1, 2], [3, 4, 5]]
})
print(df)

# More complex DataFrame structures
df1 = pd.DataFrame({
    'x': [1, 2, 3],
    'y': [{'a': 11, 'b': 12}, {'a': 21, 'b': 22}, {'a': 31, 'b': 32}]
})

df2 = pd.DataFrame({
    'x': [1, 2, 3],
    'y': [[11, 12, 13], [21], [31, 32]]
})

# Unnesting data frames
# In Python, unnesting can be achieved using 'explode' for lists and 'json_normalize' for dictionaries.
df1_exploded = pd.json_normalize(df1['y']).join(df1['x'])
print(df1_exploded)

df2_exploded = df2.explode('y')
print(df2_exploded)

# Tribble equivalent in Python is simply constructing a DataFrame directly.
df6 = pd.DataFrame({
    'x': ["a", "b", "c"],
    'y': [[1, 2], [3], []]
})
df6_exploded = df6.explode('y')
print(df6_exploded)

df4 = pd.DataFrame({
    'x': ["a", "b"],
    'y': [[1], ["a", True, 5]]
})
df4_exploded = df4.explode('y')
print(df4_exploded)

# For the last example, you would explode both lists similarly
df4 = pd.DataFrame({
    'x': ["a", "b"],
    'y': [["y-a-1", "y-a-2"], ["y-b-1", "y-b-2", "y-b-3"]],
    'z': [["z-a-1", "z-a-2"], ["z-b-1", "z-b-2", "z-b-3"]]
})
df4_exploded_y = df4.explode('y')
df4_exploded_z = df4.explode('z')
print(df4_exploded_y)
print(df4_exploded_z)

[[1, 2, 3, 4], 'a', True]
{'a': [1, 2], 'b': [1, 2, 3], 'c': [1, 2, 3, 4]}
[1, 2, 3, 4]
   x  y          z
0  1  a     [1, 2]
1  2  b  [3, 4, 5]
    a   b  x
0  11  12  1
1  21  22  2
2  31  32  3
   x   y
0  1  11
0  1  12
0  1  13
1  2  21
2  3  31
2  3  32
   x    y
0  a    1
0  a    2
1  b    3
2  c  NaN
   x     y
0  a     1
1  b     a
1  b  True
1  b     5
   x      y                      z
0  a  y-a-1         [z-a-1, z-a-2]
0  a  y-a-2         [z-a-1, z-a-2]
1  b  y-b-1  [z-b-1, z-b-2, z-b-3]
1  b  y-b-2  [z-b-1, z-b-2, z-b-3]
1  b  y-b-3  [z-b-1, z-b-2, z-b-3]
   x                      y      z
0  a         [y-a-1, y-a-2]  z-a-1
0  a         [y-a-1, y-a-2]  z-a-2
1  b  [y-b-1, y-b-2, y-b-3]  z-b-1
1  b  [y-b-1, y-b-2, y-b-3]  z-b-2
1  b  [y-b-1, y-b-2, y-b-3]  z-b-3


In [2]:
import pandas as pd
import json

# Load the JSON data from the file
with open('data/gh_repos.json', 'r') as file:
    gh_repos = json.load(file)

# Create a DataFrame with a column 'json' that contains the JSON data
repos = pd.DataFrame({'json': gh_repos})

# Display the DataFrame
print(repos)

                                                json
0  [{'id': 61160198, 'name': 'after', 'full_name'...
1  [{'id': 14756210, 'name': '2013-11_sfu', 'full...
2  [{'id': 41645119, 'name': 'advdatasci', 'full_...
3  [{'id': 56019902, 'name': '2016-14', 'full_nam...
4  [{'id': 17120350, 'name': 'ampolcourse', 'full...
5  [{'id': 57878579, 'name': 'aqi_pdf', 'full_nam...


In [11]:
import pandas as pd
import json

# Load the JSON file into a Python list of dictionaries
with open('data/gh_repos.json') as file:
    gh_repos = json.load(file)

# Convert the list of dictionaries into a pandas DataFrame
repos_df = pd.DataFrame(gh_repos)

# Show the DataFrame structure to understand how to unnest
repos_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,"{'id': 61160198, 'name': 'after', 'full_name':...","{'id': 40500181, 'name': 'argufy', 'full_name'...","{'id': 36442442, 'name': 'ask', 'full_name': '...","{'id': 34924886, 'name': 'baseimports', 'full_...","{'id': 61620661, 'name': 'citest', 'full_name'...","{'id': 33907457, 'name': 'clisymbols', 'full_n...","{'id': 37236467, 'name': 'cmaker', 'full_name'...","{'id': 67959624, 'name': 'cmark', 'full_name':...","{'id': 63152619, 'name': 'conditions', 'full_n...","{'id': 24343686, 'name': 'crayon', 'full_name'...",...,"{'id': 36510350, 'name': 'keypress', 'full_nam...","{'id': 68243980, 'name': 'lintr', 'full_name':...","{'id': 30056165, 'name': 'macBriain', 'full_na...","{'id': 40200563, 'name': 'maxygen', 'full_name...","{'id': 29020316, 'name': 'MISO', 'full_name': ...","{'id': 52725247, 'name': 'parr', 'full_name': ...","{'id': 23134624, 'name': 'parsedate', 'full_na...","{'id': 24293304, 'name': 'pingr', 'full_name':...","{'id': 23767512, 'name': 'pkgconfig', 'full_na...","{'id': 11191345, 'name': 'playground', 'full_n..."
1,"{'id': 14756210, 'name': '2013-11_sfu', 'full_...","{'id': 14152301, 'name': '2014-01-27-miami', '...","{'id': 18495266, 'name': '2014-05-12-ubc', 'fu...","{'id': 31132213, 'name': '2015-02-23_bryan-fie...","{'id': 38192453, 'name': '2015-06-28_r-summit-...","{'id': 40545399, 'name': '2015-08_bryan-jsm-st...","{'id': 65086873, 'name': '2015_Coartic', 'full...","{'id': 58010746, 'name': '2016-06_spreadsheets...","{'id': 63552154, 'name': '2016-07_data-carpent...","{'id': 15151129, 'name': '545A_hw06', 'full_na...",...,"{'id': 62916989, 'name': 'bookdown', 'full_nam...","{'id': 10124989, 'name': 'boot-camps', 'full_n...","{'id': 45448660, 'name': 'candy', 'full_name':...","{'id': 27112806, 'name': 'CoffeeCoop', 'full_n...","{'id': 22161372, 'name': 'datacarpentry', 'ful...","{'id': 50471649, 'name': 'ddpcr', 'full_name':...","{'id': 40727345, 'name': 'devtools', 'full_nam...","{'id': 46666194, 'name': 'diffr', 'full_name':...","{'id': 45375376, 'name': 'dplyr', 'full_name':...","{'id': 61313360, 'name': 'eigencoder', 'full_n..."
2,"{'id': 41645119, 'name': 'advdatasci', 'full_n...","{'id': 47568815, 'name': 'advdatasci-swirl', '...","{'id': 65922328, 'name': 'advdatasci16', 'full...","{'id': 66415014, 'name': 'advdatasci_swirl', '...","{'id': 12441219, 'name': 'ballgown', 'full_nam...","{'id': 20234724, 'name': 'capitalIn21stCentury...","{'id': 36437287, 'name': 'careerplanning', 'fu...","{'id': 7751816, 'name': 'dataanalysis', 'full_...","{'id': 4772877, 'name': 'datascientist', 'full...","{'id': 14204342, 'name': 'datasharing', 'full_...",...,"{'id': 12563551, 'name': 'googleCite', 'full_n...","{'id': 6582536, 'name': 'graduate', 'full_name...","{'id': 6661008, 'name': 'healthvis', 'full_nam...","{'id': 19133476, 'name': 'hyde', 'full_name': ...","{'id': 16584923, 'name': 'inclassfeb62014', 'f...","{'id': 7745123, 'name': 'jhsph753', 'full_name...","{'id': 15639612, 'name': 'jhsph753and4', 'full...","{'id': 42834789, 'name': 'jhudash', 'full_name...","{'id': 42873969, 'name': 'jhudash-refugee', 'f...","{'id': 19133794, 'name': 'jtleek.github.io', '..."
3,"{'id': 56019902, 'name': '2016-14', 'full_name...","{'id': 50363731, 'name': 'choroplethrCaCensusT...","{'id': 50631926, 'name': 'choroplethrUTCensusT...","{'id': 47999571, 'name': 'CountyHealthApp', 'f...","{'id': 62814408, 'name': 'data-police-shooting...","{'id': 40152050, 'name': 'ExData_Plotting1', '...","{'id': 66172034, 'name': 'fall2016competition'...","{'id': 69769188, 'name': 'ggthemes', 'full_nam...","{'id': 39162163, 'name': 'human_activity_smart...","{'id': 53640070, 'name': 'janeaustenr', 'full_...",...,"{'id': 64248946, 'name': 'r-travis', 'full_nam...","{'id': 42529303, 'name': 'RepData_PeerAssessme...","{'id': 50752693, 'name': 'SLCWaterMapping', 'f...","{'id': 55175084, 'name': 'tidytext', 'full_nam...","{'id': 54402509, 'name': 'unconf16', 'full_nam...","{'id': 46437841, 'name': 'WeightLiftingMachine...",,,,
4,"{'id': 17120350, 'name': 'ampolcourse', 'full_...","{'id': 32517704, 'name': 'apsa-leeper.bst', 'f...","{'id': 37484170, 'name': 'arco', 'full_name': ...","{'id': 58914380, 'name': 'astrojs', 'full_name...","{'id': 44051488, 'name': 'batman', 'full_name'...","{'id': 50262277, 'name': 'choco-r-devel', 'ful...","{'id': 50262595, 'name': 'choco-rtools', 'full...","{'id': 37645044, 'name': 'ciplotm', 'full_name...","{'id': 16218963, 'name': 'colourlovers', 'full...","{'id': 69040989, 'name': 'conflictcourse', 'fu...",...,"{'id': 39673558, 'name': 'drat', 'full_name': ...","{'id': 16486298, 'name': 'dvn', 'full_name': '...","{'id': 69484625, 'name': 'effect-heterogeneity...","{'id': 12600801, 'name': 'expcourse', 'full_na...","{'id': 69040960, 'name': 'exppolcourse', 'full...","{'id': 10626943, 'name': 'expResults', 'full_n...","{'id': 59699871, 'name': 'GK2011', 'full_name'...","{'id': 59957482, 'name': 'GREA', 'full_name': ...","{'id': 50747949, 'name': 'hints', 'full_name':...","{'id': 11063005, 'name': 'Impressive', 'full_n..."


In [12]:
# It appears each column contains a separate dictionary. 
# The goal is to have each repository as a separate row instead of separate columns.

# Flatten the list of lists of dicts into a single list of dicts
flattened_data = [repo for sublist in gh_repos for repo in sublist]

# Convert the flattened list of dictionaries into a pandas DataFrame
flat_repos_df = pd.DataFrame(flattened_data)

# Check the new DataFrame structure
flat_repos_df.head()

Unnamed: 0,id,name,full_name,owner,private,html_url,description,fork,url,forks_url,...,has_downloads,has_wiki,has_pages,forks_count,mirror_url,open_issues_count,forks,open_issues,watchers,default_branch
0,61160198,after,gaborcsardi/after,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/after,Run Code in the Background,False,https://api.github.com/repos/gaborcsardi/after,https://api.github.com/repos/gaborcsardi/after...,...,True,True,False,0,,0,0,0,5,master
1,40500181,argufy,gaborcsardi/argufy,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/argufy,Declarative function argument checks,False,https://api.github.com/repos/gaborcsardi/argufy,https://api.github.com/repos/gaborcsardi/arguf...,...,True,True,False,1,,6,1,6,19,master
2,36442442,ask,gaborcsardi/ask,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/ask,Friendly CLI interaction in R,False,https://api.github.com/repos/gaborcsardi/ask,https://api.github.com/repos/gaborcsardi/ask/f...,...,True,True,False,0,,4,0,4,5,master
3,34924886,baseimports,gaborcsardi/baseimports,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/baseimports,Do we get warnings for undeclared imports from...,False,https://api.github.com/repos/gaborcsardi/basei...,https://api.github.com/repos/gaborcsardi/basei...,...,True,True,False,0,,0,0,0,0,master
4,61620661,citest,gaborcsardi/citest,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/citest,Test R package and repo for the r-hub CI,True,https://api.github.com/repos/gaborcsardi/citest,https://api.github.com/repos/gaborcsardi/cites...,...,True,True,False,0,,0,0,0,0,master


In [13]:
# Normalize the nested 'owner' field into separate columns
repos_df_expanded = pd.json_normalize(flat_repos_df.to_dict(orient='records'))

# Drop the original 'owner' column as it's now redundant
repos_df_expanded.drop(columns=['owner'], inplace=True)


KeyError: "['owner'] not found in axis"

In [15]:
# Let's start the conversion process from the beginning
# We will load the JSON, create a DataFrame equivalent to an R tibble, and flatten it accordingly

# Reload the JSON file into a Python list of dictionaries
with open('data/gh_repos.json') as file:
    gh_repos = json.load(file)

# The R code creates a tibble with a column named 'json' that contains the raw JSON data.
# In Python with pandas, we would typically convert this directly into a DataFrame.

# Check if the JSON is a list of lists or a list of dicts
if all(isinstance(item, dict) for item in gh_repos):
    # If it's a list of dicts, we can directly create a DataFrame
    repos_df = pd.DataFrame(gh_repos)
else:
    # If it's a list of lists, we need to flatten it first
    flattened_data = [repo for sublist in gh_repos for repo in sublist]
    repos_df = pd.DataFrame(flattened_data)

# Now we have a DataFrame where each repository is a row and each field is a column
repos_df.head()


Unnamed: 0,id,name,full_name,owner,private,html_url,description,fork,url,forks_url,...,has_downloads,has_wiki,has_pages,forks_count,mirror_url,open_issues_count,forks,open_issues,watchers,default_branch
0,61160198,after,gaborcsardi/after,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/after,Run Code in the Background,False,https://api.github.com/repos/gaborcsardi/after,https://api.github.com/repos/gaborcsardi/after...,...,True,True,False,0,,0,0,0,5,master
1,40500181,argufy,gaborcsardi/argufy,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/argufy,Declarative function argument checks,False,https://api.github.com/repos/gaborcsardi/argufy,https://api.github.com/repos/gaborcsardi/arguf...,...,True,True,False,1,,6,1,6,19,master
2,36442442,ask,gaborcsardi/ask,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/ask,Friendly CLI interaction in R,False,https://api.github.com/repos/gaborcsardi/ask,https://api.github.com/repos/gaborcsardi/ask/f...,...,True,True,False,0,,4,0,4,5,master
3,34924886,baseimports,gaborcsardi/baseimports,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/baseimports,Do we get warnings for undeclared imports from...,False,https://api.github.com/repos/gaborcsardi/basei...,https://api.github.com/repos/gaborcsardi/basei...,...,True,True,False,0,,0,0,0,0,master
4,61620661,citest,gaborcsardi/citest,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/citest,Test R package and repo for the r-hub CI,True,https://api.github.com/repos/gaborcsardi/citest,https://api.github.com/repos/gaborcsardi/cites...,...,True,True,False,0,,0,0,0,0,master


In [16]:
# The R function `unnest_longer()` is used to transform list-columns in a tibble to longer format.
list_columns = [col for col in repos_df.columns if repos_df[col].apply(lambda x: isinstance(x, list)).any()]

# If there are list-columns, we will expand the first one found into rows
if list_columns:
    # Expanding the first list-column found
    list_column_to_expand = list_columns[0]
    expanded_df = repos_df.explode(list_column_to_expand)
else:
    # If there are no list-columns, no action is needed
    expanded_df = repos_df

# Display the DataFrame after expansion to check the result
expanded_df.head()

Unnamed: 0,id,name,full_name,owner,private,html_url,description,fork,url,forks_url,...,has_downloads,has_wiki,has_pages,forks_count,mirror_url,open_issues_count,forks,open_issues,watchers,default_branch
0,61160198,after,gaborcsardi/after,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/after,Run Code in the Background,False,https://api.github.com/repos/gaborcsardi/after,https://api.github.com/repos/gaborcsardi/after...,...,True,True,False,0,,0,0,0,5,master
1,40500181,argufy,gaborcsardi/argufy,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/argufy,Declarative function argument checks,False,https://api.github.com/repos/gaborcsardi/argufy,https://api.github.com/repos/gaborcsardi/arguf...,...,True,True,False,1,,6,1,6,19,master
2,36442442,ask,gaborcsardi/ask,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/ask,Friendly CLI interaction in R,False,https://api.github.com/repos/gaborcsardi/ask,https://api.github.com/repos/gaborcsardi/ask/f...,...,True,True,False,0,,4,0,4,5,master
3,34924886,baseimports,gaborcsardi/baseimports,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/baseimports,Do we get warnings for undeclared imports from...,False,https://api.github.com/repos/gaborcsardi/basei...,https://api.github.com/repos/gaborcsardi/basei...,...,True,True,False,0,,0,0,0,0,master
4,61620661,citest,gaborcsardi/citest,"{'login': 'gaborcsardi', 'id': 660288, 'avatar...",False,https://github.com/gaborcsardi/citest,Test R package and repo for the r-hub CI,True,https://api.github.com/repos/gaborcsardi/citest,https://api.github.com/repos/gaborcsardi/cites...,...,True,True,False,0,,0,0,0,0,master


In [17]:
# In the previous step, there were no list-columns to expand with `explode`, which is the pandas equivalent of `unnest_longer()`.
# However, if we were to follow the intent of the R code to unnest any possible nested structures (which `unnest_wider()` would do in R),

repos_df_expanded = pd.json_normalize(repos_df.to_dict(orient='records'))

# Now let's display the first few rows to check the expanded DataFrame structure
repos_df_expanded.head()

Unnamed: 0,id,name,full_name,private,html_url,description,fork,url,forks_url,keys_url,...,owner.following_url,owner.gists_url,owner.starred_url,owner.subscriptions_url,owner.organizations_url,owner.repos_url,owner.events_url,owner.received_events_url,owner.type,owner.site_admin
0,61160198,after,gaborcsardi/after,False,https://github.com/gaborcsardi/after,Run Code in the Background,False,https://api.github.com/repos/gaborcsardi/after,https://api.github.com/repos/gaborcsardi/after...,https://api.github.com/repos/gaborcsardi/after...,...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
1,40500181,argufy,gaborcsardi/argufy,False,https://github.com/gaborcsardi/argufy,Declarative function argument checks,False,https://api.github.com/repos/gaborcsardi/argufy,https://api.github.com/repos/gaborcsardi/arguf...,https://api.github.com/repos/gaborcsardi/arguf...,...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
2,36442442,ask,gaborcsardi/ask,False,https://github.com/gaborcsardi/ask,Friendly CLI interaction in R,False,https://api.github.com/repos/gaborcsardi/ask,https://api.github.com/repos/gaborcsardi/ask/f...,https://api.github.com/repos/gaborcsardi/ask/k...,...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
3,34924886,baseimports,gaborcsardi/baseimports,False,https://github.com/gaborcsardi/baseimports,Do we get warnings for undeclared imports from...,False,https://api.github.com/repos/gaborcsardi/basei...,https://api.github.com/repos/gaborcsardi/basei...,https://api.github.com/repos/gaborcsardi/basei...,...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
4,61620661,citest,gaborcsardi/citest,False,https://github.com/gaborcsardi/citest,Test R package and repo for the r-hub CI,True,https://api.github.com/repos/gaborcsardi/citest,https://api.github.com/repos/gaborcsardi/cites...,https://api.github.com/repos/gaborcsardi/cites...,...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False


In [18]:
# The equivalent of R's names() function in pandas is to get the column names of the DataFrame.
# The equivalent of R's head(10) is to get the first 10 items of an iterable.
# We will get the first 10 column names of the expanded DataFrame.

first_10_column_names = repos_df_expanded.columns.tolist()[:10]
first_10_column_names


['id',
 'name',
 'full_name',
 'private',
 'html_url',
 'description',
 'fork',
 'url',
 'forks_url',
 'keys_url']

In [19]:
# To select specific columns from the DataFrame, we use the indexing method in pandas.
# This is equivalent to the `select()` function in R.

# Selecting the specified columns from the expanded DataFrame
selected_columns_df = repos_df_expanded[['id', 'full_name', 'owner.login', 'description']]

# Displaying the selected columns to verify
selected_columns_df.head()


Unnamed: 0,id,full_name,owner.login,description
0,61160198,gaborcsardi/after,gaborcsardi,Run Code in the Background
1,40500181,gaborcsardi/argufy,gaborcsardi,Declarative function argument checks
2,36442442,gaborcsardi/ask,gaborcsardi,Friendly CLI interaction in R
3,34924886,gaborcsardi/baseimports,gaborcsardi,Do we get warnings for undeclared imports from...
4,61620661,gaborcsardi/citest,gaborcsardi,Test R package and repo for the r-hub CI


In [20]:
# We will build a list of column names that start with 'owner.' to select all owner-related columns
owner_columns = [col for col in repos_df_expanded.columns if col.startswith('owner.')]

# Selecting the specified columns along with all owner-related columns from the expanded DataFrame
selected_columns_with_owner_df = repos_df_expanded[['id', 'full_name', 'description'] + owner_columns]

# Displaying the selected columns to verify
selected_columns_with_owner_df.head()

Unnamed: 0,id,full_name,description,owner.login,owner.id,owner.avatar_url,owner.gravatar_id,owner.url,owner.html_url,owner.followers_url,owner.following_url,owner.gists_url,owner.starred_url,owner.subscriptions_url,owner.organizations_url,owner.repos_url,owner.events_url,owner.received_events_url,owner.type,owner.site_admin
0,61160198,gaborcsardi/after,Run Code in the Background,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
1,40500181,gaborcsardi/argufy,Declarative function argument checks,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
2,36442442,gaborcsardi/ask,Friendly CLI interaction in R,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
3,34924886,gaborcsardi/baseimports,Do we get warnings for undeclared imports from...,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
4,61620661,gaborcsardi/citest,Test R package and repo for the r-hub CI,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False


In [21]:
# The 'json_normalize' function already uses a separator for nested fields which is by default '.',
# but we can specify a different separator if needed.
# Since we want the separator to be '_', we'll need to redefine the DataFrame with the correct separator.

# Redefining the DataFrame with the specified separator '_'
repos_df_expanded_with_underscore = pd.json_normalize(repos_df.to_dict(orient='records'), sep='_')

# Selecting the specified columns from the expanded DataFrame with the new separator for 'owner' fields
selected_columns_with_underscore_df = repos_df_expanded_with_underscore[
    ['id', 'full_name', 'description'] + [col for col in repos_df_expanded_with_underscore.columns if col.startswith('owner_')]
]

# Displaying the selected columns to verify
selected_columns_with_underscore_df.head()


Unnamed: 0,id,full_name,description,owner_login,owner_id,owner_avatar_url,owner_gravatar_id,owner_url,owner_html_url,owner_followers_url,owner_following_url,owner_gists_url,owner_starred_url,owner_subscriptions_url,owner_organizations_url,owner_repos_url,owner_events_url,owner_received_events_url,owner_type,owner_site_admin
0,61160198,gaborcsardi/after,Run Code in the Background,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
1,40500181,gaborcsardi/argufy,Declarative function argument checks,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
2,36442442,gaborcsardi/ask,Friendly CLI interaction in R,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
3,34924886,gaborcsardi/baseimports,Do we get warnings for undeclared imports from...,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False
4,61620661,gaborcsardi/citest,Test R package and repo for the r-hub CI,gaborcsardi,660288,https://avatars.githubusercontent.com/u/660288...,,https://api.github.com/users/gaborcsardi,https://github.com/gaborcsardi,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/follo...,https://api.github.com/users/gaborcsardi/gists...,https://api.github.com/users/gaborcsardi/starr...,https://api.github.com/users/gaborcsardi/subsc...,https://api.github.com/users/gaborcsardi/orgs,https://api.github.com/users/gaborcsardi/repos,https://api.github.com/users/gaborcsardi/event...,https://api.github.com/users/gaborcsardi/recei...,User,False


In [25]:
# Load the JSON file into a pandas DataFrame
chars_path = 'data/got_chars.json'
chars = pd.read_json(chars_path)

# Since in R the tibble is 30 by 1, it suggests that the JSON data is a list of 30 items
# We need to ensure that the DataFrame in Python also reflects this structure
# Each item in the list should be a row in the DataFrame, and there should be only one column

# If the JSON is a list of dicts, pd.read_json should handle this correctly by default
# However, if the JSON is nested deeper or in a different format, we may need to adjust the approach
# First, let's check the shape of the DataFrame to see if it matches the expected 30x1 structure
chars_shape = chars.shape
chars_head = chars.head()  # Display the first few rows for inspection

chars_shape, chars_head


((30, 18),
                                                  url    id               name  \
 0  https://www.anapioficeandfire.com/api/characte...  1022      Theon Greyjoy   
 1  https://www.anapioficeandfire.com/api/characte...  1052   Tyrion Lannister   
 2  https://www.anapioficeandfire.com/api/characte...  1074  Victarion Greyjoy   
 3  https://www.anapioficeandfire.com/api/characte...  1109               Will   
 4  https://www.anapioficeandfire.com/api/characte...  1166         Areo Hotah   
 
   gender   culture                            born  \
 0   Male  Ironborn    In 278 AC or 279 AC, at Pyke   
 1   Male               In 273 AC, at Casterly Rock   
 2   Male  Ironborn    In 268 AC or before, at Pyke   
 3   Male                                             
 4   Male  Norvoshi  In 257 AC or before, at Norvos   
 
                            died  alive  \
 0                                 True   
 1                                 True   
 2                                

In [26]:
# Create a new DataFrame with a single column 'json' that contains the JSON representation of each character
chars_single_col = pd.DataFrame({
    'json': chars.apply(lambda x: x.to_json(), axis=1)
})

# Verify the new structure
chars_single_col_shape = chars_single_col.shape
chars_single_col_head = chars_single_col.head()  # Display the first few rows for inspection

chars_single_col_shape, chars_single_col_head

((30, 1),
                                                 json
 0  {"url":"https:\/\/www.anapioficeandfire.com\/a...
 1  {"url":"https:\/\/www.anapioficeandfire.com\/a...
 2  {"url":"https:\/\/www.anapioficeandfire.com\/a...
 3  {"url":"https:\/\/www.anapioficeandfire.com\/a...
 4  {"url":"https:\/\/www.anapioficeandfire.com\/a...)

In [27]:
# To unnest the 'json' column into a wider format, we'll need to convert the JSON strings back to dictionaries
# and then use json_normalize to create a DataFrame with each key in the JSON object as a column.

# Import the json module to parse JSON strings
import json

# Convert the JSON strings in the 'json' column back to dictionaries
chars_dicts = chars_single_col['json'].apply(json.loads)

# Use json_normalize to create a wider DataFrame
chars_wider = pd.json_normalize(chars_dicts)

# Verify the result
chars_wider_shape = chars_wider.shape
chars_wider_head = chars_wider.head()  # Display the first few rows for inspection

chars_wider_shape, chars_wider_head

((30, 18),
                                                  url    id               name  \
 0  https://www.anapioficeandfire.com/api/characte...  1022      Theon Greyjoy   
 1  https://www.anapioficeandfire.com/api/characte...  1052   Tyrion Lannister   
 2  https://www.anapioficeandfire.com/api/characte...  1074  Victarion Greyjoy   
 3  https://www.anapioficeandfire.com/api/characte...  1109               Will   
 4  https://www.anapioficeandfire.com/api/characte...  1166         Areo Hotah   
 
   gender   culture                            born  \
 0   Male  Ironborn    In 278 AC or 279 AC, at Pyke   
 1   Male               In 273 AC, at Casterly Rock   
 2   Male  Ironborn    In 268 AC or before, at Pyke   
 3   Male                                             
 4   Male  Norvoshi  In 257 AC or before, at Norvos   
 
                            died  alive  \
 0                                 True   
 1                                 True   
 2                                

In [28]:
# Using the chars_wider DataFrame, we can select the specified columns: 'id', 'name', 'gender', 'culture', 'born', 'died', and 'alive'.
# This is done using the DataFrame's indexing method with a list of column names.

characters = chars_wider[['id', 'name', 'gender', 'culture', 'born', 'died', 'alive']]

# Verify the result
characters_shape = characters.shape
characters_head = characters.head()  # Display the first few rows for inspection

characters_shape, characters_head


((30, 7),
      id               name gender   culture                            born  \
 0  1022      Theon Greyjoy   Male  Ironborn    In 278 AC or 279 AC, at Pyke   
 1  1052   Tyrion Lannister   Male               In 273 AC, at Casterly Rock   
 2  1074  Victarion Greyjoy   Male  Ironborn    In 268 AC or before, at Pyke   
 3  1109               Will   Male                                             
 4  1166         Areo Hotah   Male  Norvoshi  In 257 AC or before, at Norvos   
 
                            died  alive  
 0                                 True  
 1                                 True  
 2                                 True  
 3  In 297 AC, at Haunted Forest  False  
 4                                 True  )

In [29]:
characters

Unnamed: 0,id,name,gender,culture,born,died,alive
0,1022,Theon Greyjoy,Male,Ironborn,"In 278 AC or 279 AC, at Pyke",,True
1,1052,Tyrion Lannister,Male,,"In 273 AC, at Casterly Rock",,True
2,1074,Victarion Greyjoy,Male,Ironborn,"In 268 AC or before, at Pyke",,True
3,1109,Will,Male,,,"In 297 AC, at Haunted Forest",False
4,1166,Areo Hotah,Male,Norvoshi,"In 257 AC or before, at Norvos",,True
5,1267,Chett,Male,,At Hag's Mire,"In 299 AC, at Fist of the First Men",False
6,1295,Cressen,Male,,In 219 AC or 220 AC,"In 299 AC, at Dragonstone",False
7,130,Arianne Martell,Female,Dornish,"In 276 AC, at Sunspear",,True
8,1303,Daenerys Targaryen,Female,Valyrian,"In 284 AC, at Dragonstone",,True
9,1319,Davos Seaworth,Male,Westeros,"In 260 AC or before, at King's Landing",,True


In [31]:
# We need to find columns where the data type is 'object' which can contain lists
# and then check if any cell in the column is actually a list.

# Select columns with 'object' dtype which can potentially contain lists
object_columns = chars_wider.select_dtypes(include=['object'])

# Now filter these columns to find which ones contain lists
columns_with_lists = object_columns.applymap(lambda x: isinstance(x, list)).any()

# Create a list of column names that contain lists
list_column_names = columns_with_lists[columns_with_lists].index.tolist()

# Now select the 'id' column and the columns that contain lists
characters_with_list_data = chars_wider[['id'] + list_column_names]

# Verify the result
characters_with_list_data_shape = characters_with_list_data.shape
characters_with_list_data_head = characters_with_list_data.head()  # Display the first few rows for inspection

characters_with_list_data_shape, characters_with_list_data_head

((30, 8),
      id                                             titles  \
 0  1022  [Prince of Winterfell, Lord of the Iron Island...   
 1  1052  [Acting Hand of the King (former), Master of C...   
 2  1074  [Lord Captain of the Iron Fleet, Master of the...   
 3  1109                                                      
 4  1166                   Captain of the Guard at Sunspear   
 
                                              aliases  \
 0  [Prince of Fools, Theon Turncloak, Reek, Theon...   
 1  [The Imp, Halfman, The boyman, Giant of Lannis...   
 2                                   The Iron Captain   
 3                                                      
 4                                                      
 
                          allegiances  \
 0              House Greyjoy of Pyke   
 1   House Lannister of Casterly Rock   
 2              House Greyjoy of Pyke   
 3                                 []   
 4  House Nymeros Martell of Sunspear   
 
                  

In [32]:
# To unnest the 'titles' column into a longer format, where each title has its own row,
# we will use the DataFrame's explode method. This method transforms each element of a list-like
# into a row, replicating the index values.

# First, select the 'id' and 'titles' columns from chars_wider DataFrame
chars_id_titles = chars_wider[['id', 'titles']]

# Now, use explode on the 'titles' column
chars_longer_titles = chars_id_titles.explode('titles')

# Verify the result
chars_longer_titles_shape = chars_longer_titles.shape
chars_longer_titles_head = chars_longer_titles.head()  # Display the first few rows for inspection

chars_longer_titles_shape, chars_longer_titles_head

((59, 2),
      id                                             titles
 0  1022                               Prince of Winterfell
 0  1022  Lord of the Iron Islands (by law of the green ...
 1  1052                   Acting Hand of the King (former)
 1  1052                            Master of Coin (former)
 2  1074                     Lord Captain of the Iron Fleet)

In [33]:
# To filter out rows where 'titles' is an empty string, we will use the DataFrame's query method.
# After that, we will rename the 'titles' column to 'title'.

# Filter out the rows where 'titles' column is an empty string
chars_titles_filtered = chars_longer_titles.query("titles != ''")

# Rename the 'titles' column to 'title'
chars_titles_filtered = chars_titles_filtered.rename(columns={'titles': 'title'})

# Verify the result
titles_shape = chars_titles_filtered.shape
titles_head = chars_titles_filtered.head()  # Display the first few rows for inspection

titles_shape, titles_head

((52, 2),
      id                                              title
 0  1022                               Prince of Winterfell
 0  1022  Lord of the Iron Islands (by law of the green ...
 1  1052                   Acting Hand of the King (former)
 1  1052                            Master of Coin (former)
 2  1074                     Lord Captain of the Iron Fleet)

In [36]:
# First, let's recreate the dataframe from the 'got_chars' json, similar to the earlier steps.
# Then, we will perform the equivalent operations in Python as described in the R code.

# Assuming 'got_chars' is a JSON string or a path to a .json file containing the Game of Thrones characters data.

# Read the JSON data into a DataFrame
got_chars_df = pd.read_json('data/got_chars.json')

# Filter out the columns where the data type is list
is_list = lambda x: isinstance(x, list)
list_columns = got_chars_df.applymap(is_list).any()
list_column_names = list_columns[list_columns].index.tolist()

# Select the 'id' column and the columns that contain lists
df_with_lists = got_chars_df[['id'] + list_column_names]

# Pivot longer, where each list becomes a row, and the list's original column name becomes a value in the 'name' column
df_pivot_longer = df_with_lists.melt(id_vars=['id'], value_vars=list_column_names, var_name='name', value_name='value')

# Unnest the 'value' column which contains lists into longer format
df_unnest_longer = df_pivot_longer.explode('value')

df_unnest_longer.head()

Unnamed: 0,id,name,value
0,1022,titles,Prince of Winterfell
0,1022,titles,Lord of the Iron Islands (by law of the green ...
1,1052,titles,Acting Hand of the King (former)
1,1052,titles,Master of Coin (former)
2,1074,titles,Lord Captain of the Iron Fleet


In [34]:
import json
import pandas as pd

# Parsing JSON strings and printing their data types using Python's json.loads() and type()
parsed_json_1 = json.loads('1')
parsed_json_array = json.loads('[1, 2, 3]')
parsed_json_object = json.loads('{"x": [1, 2, 3]}')

# Creating a DataFrame from a JSON string that represents an array of objects
json_array = '[{"name": "John", "age": 34}, {"name": "Susan", "age": 27}]'
df_array = pd.DataFrame(json.loads(json_array))

# Unnesting a wider DataFrame directly from the parsed JSON
df_array_wider = pd.json_normalize(json.loads(json_array))

# Creating a DataFrame from a JSON string that represents an object containing an array of objects
json_object = '''
{
  "status": "OK",
  "results": [
    {"name": "John", "age": 34},
    {"name": "Susan", "age": 27}
  ]
}
'''
df_object = pd.DataFrame({'json': [json.loads(json_object)]})

# Unnesting the 'results' field from the JSON object into a DataFrame, then unnesting it longer and wider
df_results_longer_wider = pd.json_normalize(json.loads(json_object), 'results')

# Creating a DataFrame directly from the 'results' field in the parsed JSON object
df_results = pd.json_normalize(json.loads(json_object), 'results')

# Creating DataFrames from JSON strings that represent an object and an array of objects, respectively
json_col = '''
{
  "x": ["a", "x", "z"],
  "y": [10, null, 3]
}
'''
json_row = '''
[
  {"x": "a", "y": 10},
  {"x": "x", "y": null},
  {"x": "z", "y": 3}
]
'''
df_col = pd.DataFrame({'json': [json.loads(json_col)]})
df_row = pd.DataFrame(json.loads(json_row))

# Display the types of the parsed JSON and the DataFrames created
(parsed_json_1, type(parsed_json_1)), (parsed_json_array, type(parsed_json_array)), (parsed_json_object, type(parsed_json_object)), df_array, df_array_wider, df_results_longer_wider, df_col, df_row


((1, int),
 ([1, 2, 3], list),
 ({'x': [1, 2, 3]}, dict),
     name  age
 0   John   34
 1  Susan   27,
     name  age
 0   John   34
 1  Susan   27,
     name  age
 0   John   34
 1  Susan   27,
                                          json
 0  {'x': ['a', 'x', 'z'], 'y': [10, None, 3]},
    x     y
 0  a  10.0
 1  x   NaN
 2  z   3.0)

In [38]:
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.packages import importr

# Activate the automatic conversion of rpy2 objects to pandas objects
pandas2ri.activate()

# Import the R package
repurrrsive = importr('repurrrsive')

# Load the gmaps_cities dataset
gmaps_cities_r = ro.r('gmaps_cities')

# Convert to a pandas DataFrame
gmaps_cities_df = pandas2ri.ri2py(gmaps_cities_r)


AttributeError: module 'rpy2.robjects.pandas2ri' has no attribute 'ri2py'

In [40]:
import rpy2.robjects as ro
from rpy2.robjects import conversion

# Load the R data
gmaps_cities_r = ro.r('gmaps_cities')

# Initialize an empty list to hold the converted Python dictionaries
python_list = []

# Convert each element of the R list to a Python dictionary
for i in range(len(gmaps_cities_r)):
    with conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
        python_dict = conversion.rpy2py(gmaps_cities_r[i])
        python_list.append(python_dict)

# At this point, python_list is a list of Python dictionaries
# You can now create a pandas DataFrame from it
gmaps_cities_df = pd.DataFrame(python_list)



TypeError: 'NULLType' object is not iterable

In [42]:
import rpy2.robjects as ro
from rpy2.robjects import conversion
from rpy2.rinterface import NULL

# Load the R data
gmaps_cities_r = ro.r('gmaps_cities')

# Initialize an empty list to hold the converted Python objects
python_list = []

# Convert each non-NULL element of the R list to a Python object
for i in range(len(gmaps_cities_r)):
    # Skip over NULL values in the R list
    if gmaps_cities_r[i] is NULL:
        continue

    # For non-NULL values, perform the conversion
    with conversion.localconverter(ro.default_converter + ro.pandas2ri.converter):
        python_obj = conversion.rpy2py(gmaps_cities_r[i])
        python_list.append(python_obj)

# Now, python_list is a list of Python objects, which you can convert to a pandas DataFrame
# You may need additional processing depending on the structure of the objects


TypeError: 'NULLType' object is not iterable