In [2]:
import nbformat
import os
import re
from itertools import chain
import ast
import pipdeptree



## Find all packages that are currently being pip or conda installed

In [4]:
# Finds a `pip install` / `conda install` command anywhere in a line (so the
# `!pip`, `%pip` and `python -m pip` spellings all match) and captures the
# remainder of the line, i.e. the command's arguments.
_INSTALL_COMMAND = re.compile(r'\b(?:pip3?|conda)\s+install\b(.*)')

# Install options whose *next* token is the option's value rather than a
# package name (requirement/constraint files, channels, environments, index
# URLs, target directories, editable paths).
_OPTIONS_WITH_VALUE = frozenset({
    '-r', '--requirement', '-c', '--constraint', '--channel',
    '-e', '--editable', '-t', '--target', '-i', '--index-url',
    '--extra-index-url', '-f', '--find-links', '-n', '--name',
    '-p', '--prefix', '--root', '--file',
})


def _packages_from_install_line(line):
    """Return the requirement tokens named by one install command line.

    Returns an empty list when the line is a comment or holds no install
    command. Option flags (and the values of options that take one) are
    dropped; a token starting with ``#`` ends the command.
    """
    if line.lstrip().startswith('#'):
        # A commented-out command is not actually being installed.
        return []

    command = _INSTALL_COMMAND.search(line)
    if command is None:
        return []

    packages = []
    skip_next = False
    for token in command.group(1).split():
        if token.startswith('#'):
            break  # trailing comment
        if skip_next:
            skip_next = False
            continue
        if token.startswith('-'):
            skip_next = token in _OPTIONS_WITH_VALUE
            continue
        requirement = token.strip('\'"')  # e.g. "pandas>=1.5" written in quotes
        if requirement:
            packages.append(requirement)
    return packages


def get_required_packages(notebook_path):
    """Collect the packages a notebook installs via pip or conda.

    Parameters
    ----------
    notebook_path : str
        Path to the ``.ipynb`` file; it is read as UTF-8 and loaded as
        notebook format version 4.

    Returns
    -------
    set of str
        One entry per requirement token found after a ``pip install`` or
        ``conda install`` command in the code cells (version specifiers are
        kept, e.g. ``'pandas==1.5.3'``). Empty when nothing is installed.
    """
    with open(notebook_path, 'r', encoding='utf-8') as notebook_file:
        notebook_content = nbformat.read(notebook_file, as_version=4)

    required_packages = set()

    for cell in notebook_content['cells']:
        if cell['cell_type'] != 'code':
            continue
        for line in cell['source'].split('\n'):
            required_packages.update(_packages_from_install_line(line))

    return required_packages

def get_notebooks_in_folder(folder_path):
    """Return the paths of all ``.ipynb`` files found under ``folder_path``.

    The folder is walked recursively with ``os.walk``; each result is the
    directory path joined with the file name. A folder with no notebooks
    yields an empty list.
    """
    return [
        os.path.join(dirpath, filename)
        for dirpath, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
        if filename.endswith('.ipynb')
    ]

def get_required_packages_from_folder(folder_path):
    """Map each notebook under ``folder_path`` to the packages it installs.

    Paths containing ``-checkpoint`` are skipped, and notebooks for which
    ``get_required_packages`` returns nothing are left out of the result.

    Returns a dict of notebook path -> set of package strings.
    """
    packages_by_notebook = {}

    for path in get_notebooks_in_folder(folder_path):
        if '-checkpoint' in str(path):
            continue
        found = get_required_packages(path)
        if len(found) > 0:
            packages_by_notebook[path] = found

    return packages_by_notebook

# Scan every notebook under 'content' for pip/conda install commands.
chapters_packages = get_required_packages_from_folder('content')

# Report one (notebook path, installed packages) tuple per line.
print("Packages being manually installed in notebooks in the 'content' folder:")
for notebook_path, installed in chapters_packages.items():
    print((notebook_path, installed))

def flatten_chain(matrix):
    """Merge an iterable of iterables into a single set of their items."""
    return {item for row in matrix for item in row}

# Separator line followed by the heading for the combined result.
print(
    "______________________________________________________________________________________________________",
    "\nAll packages being manually installed:",
    sep="\n",
)
# Last expression of the cell: the union of every notebook's installed packages.
flatten_chain(chapters_packages.values())

Packages being manually installed in notebooks in the 'content' folder:
______________________________________________________________________________________________________

All packages being manually installed:


set()

## Find all packages that are imported

In [5]:
# NOTE: this re-defines the helper of the same name from the earlier
# "pip or conda installed" cell; both definitions behave identically.
def get_notebooks_in_folder(folder_path):
    """List every ``.ipynb`` file below ``folder_path`` (recursive walk)."""
    found = []
    for current_dir, _, entries in os.walk(folder_path):
        found.extend(
            os.path.join(current_dir, entry)
            for entry in entries
            if entry.endswith('.ipynb')
        )
    return found

# `from pkg.sub import name` -> captures `pkg`. Relative imports
# (`from . import x`) do not match because the module must start with an
# identifier character.
_FROM_IMPORT = re.compile(r'^from\s+([A-Za-z_]\w*)[\w.]*\s+import\b')

# `import a.b as c, d` -> captures everything after the keyword. The mandatory
# whitespace keeps names such as `important` or `importlib = ...` from matching.
_PLAIN_IMPORT = re.compile(r'^import\s+(.+)$')


def _top_level_modules(line):
    """Yield the top-level module name of every import on one source line."""
    code = line.split('#', 1)[0]  # drop comments (also whole-line, indented ones)
    for statement in code.split(';'):  # `import a; import b` on one line
        statement = statement.strip()

        from_match = _FROM_IMPORT.match(statement)
        if from_match:
            yield from_match.group(1)
            continue

        plain_match = _PLAIN_IMPORT.match(statement)
        if plain_match:
            for target in plain_match.group(1).split(','):
                words = target.split()  # ['pkg.sub', 'as', 'alias'] or ['pkg']
                if not words:
                    continue
                name = words[0].split('.')[0]
                if name.isidentifier():  # ignores stray tokens such as a trailing '\'
                    yield name


def get_imported_packages(notebook_path):
    """Collect the top-level packages imported by a notebook's code cells.

    Parameters
    ----------
    notebook_path : str
        Path to the ``.ipynb`` file; it is read as UTF-8 and loaded as
        notebook format version 4.

    Returns
    -------
    set of str
        Top-level module names from ``import x`` / ``from x import y``
        statements (``scipy.stats`` counts as ``scipy``). Commented-out and
        relative imports are ignored.
    """
    with open(notebook_path, 'r', encoding='utf-8') as notebook_file:
        notebook_content = nbformat.read(notebook_file, as_version=4)

    required_packages = set()

    for cell in notebook_content['cells']:
        if cell['cell_type'] != 'code':
            continue
        for line in cell['source'].split('\n'):
            required_packages.update(_top_level_modules(line))

    return required_packages

def get_imported_packages_from_folder(folder_path):
    """Map each notebook under ``folder_path`` to the packages it imports.

    Paths containing ``-checkpoint`` are skipped, and notebooks for which
    ``get_imported_packages`` returns nothing are left out of the result.

    Returns a dict of notebook path -> set of package names.
    """
    imports_by_notebook = {}

    for path in get_notebooks_in_folder(folder_path):
        if '-checkpoint' in str(path):
            continue
        found = get_imported_packages(path)
        if len(found) > 0:
            imports_by_notebook[path] = found

    return imports_by_notebook

# Scan every notebook under 'content' for import statements.
chapters_imported_packages = get_imported_packages_from_folder('content')

# Report one (notebook path, imported packages) tuple per line.
print("Packages being imported in notebooks in the 'content' folder:")
for notebook_path, imported in chapters_imported_packages.items():
    print((notebook_path, imported))


# NOTE: same helper as the `flatten_chain` defined in the earlier cell.
def flatten_chain(matrix):
    """Return the union of all items contained in the iterables of ``matrix``."""
    return set().union(*matrix)

# Union of the packages imported by any notebook; reused by the
# requirements-generation cells further down.
all_packs_imported = flatten_chain(chapters_imported_packages.values())

# Separator line, heading, then one package name per line.
print(
    "______________________________________________________________________________________________________",
    "\nAll packages being imported:",
    sep="\n",
)
for package_name in all_packs_imported:
    print(package_name)


Packages being imported in notebooks in the 'content' folder:
('content/05-regex/regex.ipynb', {'regex', 'pandas'})
('content/02-prereqs/numpy.ipynb', {'numpy'})
('content/07-sql/sql.ipynb', {'numpy', 'pathlib', 'sqlalchemy', 'pandas'})
('content/04-visualizations/types.ipynb', {'seaborn', 'matplotlib', 'plotly', 'pandas', 'numpy'})
('content/04-visualizations/advanced.ipynb', {'numpy', 'seaborn', 'pandas', 'matplotlib'})
('content/06-api/api.ipynb', {'urllib', 'requests', 'pandas'})
('content/08-regression/regressions.ipynb', {'statsmodels', 'pandas'})
('content/03-pandas/merge.ipynb', {'pandas'})
('content/03-pandas/groupby_pivot.ipynb', {'numpy', 'pandas'})
('content/03-pandas/loading-looking.ipynb', {'pandas'})
('content/03-pandas/select_modify.ipynb', {'numpy', 'pandas'})
('content/03-pandas/misc_funcs.ipynb', {'datetime', 'numpy', 'pandas'})
______________________________________________________________________________________________________

All packages being imported:
urllib


## Generate streamlined requirements.txt

In [6]:
# Example: query pipdeptree for a single package to see what its raw output looks like.
x = "ydata-profiling" #example pipdeptree for a package
# IPython shell capture: runs `pipdeptree -p <value of x>` and stores the
# command's output lines in `out`.
out = !pipdeptree -p $x
# Display the captured lines as the cell result.
out

 '* econtools==0.3.2',
 ' - scipy [required: Any, installed: ?]',
 '------------------------------------------------------------------------',
 'No packages matched using the following patterns: ydata-profiling']

In [7]:
# Pattern for one pipdeptree dependency line, e.g.
#   "- numpy [required: >=1.18.5,<1.25.0, installed: 1.23.2]"
# group(1) is the package name, group(2) the installed version.
# Written as a raw string so that `\[` reaches the regex engine as an escaped
# bracket instead of being an invalid string escape sequence.
p = re.compile(r"- (.*) \[.*installed: (.*)]")
# Quick check of the pattern against a sample line.
m = p.match('- numpy [required: >=1.18.5,<1.25.0, installed: 1.23.2]')
m.group(1), m.group(2)

('numpy', '1.23.2')

In [14]:
pck='statsmodels'
out = !pipdeptree -p $pck #get pipdeptree of pck
    
out #= [y.strip() for y in out if (y.lower().startswith(pck)) or ("installed" in y)]

 '* econtools==0.3.2',
 ' - scipy [required: Any, installed: ?]',
 '------------------------------------------------------------------------',
 'No packages matched using the following patterns: statsmodels']

In [15]:
# loop through main packages and collect dependencies in dictionary called pack
# buggy cases go into buggy_pcks
# i have manually removed jl4 and all it's dependencies from buggy cases for now (otherwise they would appear there)
pack = {}
buggy_pcks = {}
for pck in all_packs_imported:
    out = !pipdeptree -p $pck #get pipdeptree of pck
    
    out = [y.strip() for y in out if (y.lower().startswith(pck)) or ("installed" in y)] #take out unrelated text
    
    for depend in out: 
        if (pck+"=") in depend.lower(): #for including the actual pck itself
            package = depend.split("==")[0]
            version = depend.split("==")[1]
            pack[package] = version
            continue
            
        else:
            try:
                m = p.match(depend) #for including the dependencies of pck
                package = m.group(1)
                version = m.group(2)
                if version == '?':
                    version = ''    
                pack[package] = version
            except: #for debugging :)
                if not str(depend).startswith('- jupyterlab') and \
                not str(depend).strip().startswith('└──') and not str(depend).strip().startswith('├──') and \
                not str(depend).strip().startswith('│'):
                    print(depend)
                    if pck not in buggy_pcks:
                        buggy_pcks[pck] = [depend]
                    else:
                        buggy_pcks[pck].append(depend)
pack    
    

{'scipy': '',
 'seaborn': '0.12.2',
 'matplotlib': '3.7.1',
 'plotly': '5.13.0',
 'requests': '2.28.1',
 'SQLAlchemy': '1.4.46',
 'pandas': '1.5.3',
 'regex': '2023.5.5',
 'numpy': '1.25.0'}

In [16]:
# Show the pipdeptree lines the collection loop could not parse, keyed by
# package; an empty dict means every line was handled.
buggy_pcks

{}

In [17]:
# Package names for requirements.txt in plain string order, which places
# names starting with capitals ahead of lowercase ones.
pk = sorted(pack.keys())

In [18]:
# The actual content of requirements.txt: "name==version" when a version was
# recorded, otherwise just the bare package name.
for pck in pk:
    print(pck + "==" + pack[pck] if pack[pck] != "" else pck)

SQLAlchemy==1.4.46
matplotlib==3.7.1
numpy==1.25.0
pandas==1.5.3
plotly==5.13.0
regex==2023.5.5
requests==2.28.1
scipy
seaborn==0.12.2
