
# Big Data Best Practices - Complete Project Notebook
### **by JeffTheDeveloper**  
##### *Inspired by Néstor Nicolás Campos Rojas' methodologies*



<br>
## 1. Project Setup & File Structure
Creates the complete folder structure and generates sample datasets.


In [None]:
# Install the dependencies listed in requirements.txt (path is relative to the
# working directory). The %pip magic targets the running kernel's environment.
%pip install -r requirements.txt

If that doesn't work, try the cell below instead:

In [None]:
# Instalação de todas as dependências em uma única célula
!pip install pandas==1.5.0 numpy==1.21.0 jupyter==1.0.0 ipython==8.0.0 pyarrow==8.0.0 tqdm==4.65.0 python-dotenv==0.21.0
!pip install jupyter-contrib-nbextensions==0.5.1 jupyter-nbextensions-configurator==0.4.1

# Verificação das versões instaladas
import pandas as pd
import numpy as np
import IPython

print("\n✅ Dependências instaladas com sucesso:")
print(f"pandas: {pd.__version__}")
print(f"numpy: {np.__version__}")
print(f"IPython: {IPython.__version__}")
print("Pronto para executar o projeto Big Data Best Practices!")

In [None]:
# Advanced dependencies (uncomment if needed)
# !pip install pyspark==3.3.0 databricks-connect==10.4 plotly==5.11.0 ipywidgets==8.0.0

**Execute esta célula uma vez no início do notebook**

Se estiver no Databricks, substitua por:



In [3]:

dbutils.library.installPyPI("pandas", version="1.5.0")
dbutils.library.installPyPI("pyarrow") 


NameError: name 'dbutils' is not defined


**Para ambientes com restrições de internet, use:**


In [None]:

%pip install --no-deps pandas numpy  # Ignora dependências secundárias

In [1]:

import os
import json
import zipfile
import pandas as pd
import numpy as np
from IPython.display import HTML, display

# Lay out the project skeleton beneath a single root directory.
project_root = 'big-data-best-practices'
folders = [
    f"{project_root}/{subdir}"
    for subdir in ('databricks_notebooks', 'sample_data/transactions', 'docs')
]

# exist_ok=True keeps this cell re-runnable: directories that already exist are left as they are.
for folder in folders:
    os.makedirs(folder, exist_ok=True)
    print(f"Created directory: {folder}")


ModuleNotFoundError: No module named 'pandas'



## 2. Sample Data Generation
Creates realistic datasets for demonstration.


In [None]:


# Seeded generator so amounts and currencies are identical on every Restart & Run All.
# (Timestamps still record the moment the data was generated.)
SEED = 42
rng = np.random.default_rng(SEED)

SAMPLE_DATA_DIR = 'big-data-best-practices/sample_data'
TRANSACTIONS_DIR = f'{SAMPLE_DATA_DIR}/transactions'
# Create the target folder here as well so this cell does not depend on the setup cell.
os.makedirs(TRANSACTIONS_DIR, exist_ok=True)

# Transaction data (streaming): 50 synthetic records.
# int()/str() convert NumPy scalars into plain Python values that json.dump can serialize.
transactions = [
    {"transaction_id": f"T{1000+i}", "amount": int(rng.integers(10, 500)),
     "currency": str(rng.choice(["USD", "BRL", "EUR"])), "timestamp": pd.Timestamp.now().isoformat()}
    for i in range(50)
]

# The records are split in half across two files.
with open(f'{TRANSACTIONS_DIR}/stream_1.json', 'w', encoding='utf-8') as f:
    json.dump(transactions[:25], f)

with open(f'{TRANSACTIONS_DIR}/stream_2.json', 'w', encoding='utf-8') as f:
    json.dump(transactions[25:], f)

# Reference data: one row per currency used in the transactions above.
reference_data = pd.DataFrame({
    "currency": ["USD", "BRL", "EUR"],
    "exchange_rate": [1.0, 5.20, 0.95],
    "risk_level": ["Low", "High", "Medium"]
})
reference_data.to_csv(f'{SAMPLE_DATA_DIR}/reference_data.csv', index=False)

print("Sample data created:")
display(reference_data.head())




## 3. Notebook Creation
Generates the Databricks notebooks with best practices.


In [None]:

def _markdown_cell(lines):
    """Return an nbformat-4 markdown cell whose source is the given list of lines."""
    return {"cell_type": "markdown", "metadata": {}, "source": lines}


def _code_cell(lines):
    """Return an unexecuted nbformat-4 code cell whose source is the given list of lines."""
    return {
        "cell_type": "code",
        "execution_count": None,
        "metadata": {},
        "outputs": [],
        "source": lines,
    }


def _notebook_json(cells):
    """Serialize ``cells`` into a complete nbformat-4 notebook document (a JSON string).

    Besides ``cells``, the document carries the top-level ``metadata``, ``nbformat``
    and ``nbformat_minor`` keys that the notebook format requires. Minor version 4
    is used so that cells do not need an ``id`` field.
    """
    document = {
        "cells": cells,
        "metadata": {"language_info": {"name": "python"}},
        "nbformat": 4,
        "nbformat_minor": 4,
    }
    return json.dumps(document, indent=1, ensure_ascii=False)


# Maps each notebook file name to its serialized .ipynb content.
notebooks = {
    "1_Data_Ingestion.ipynb": _notebook_json([
        _markdown_cell([
            "# Data Ingestion (Best Practices)\n",
            "From Néstor Campos' Big Data Course",
        ]),
        _code_cell([
            "# AutoLoader configuration\n",
            "df = (spark.readStream\n",
            '  .format("cloudFiles")\n',
            '  .option("cloudFiles.format", "json")\n',
            '  .load("/mnt/sample_data/transactions"))',
        ]),
    ]),
    "2_Data_Transformation.ipynb": _notebook_json([
        _markdown_cell([
            "# Data Transformation\n",
            "## Currency Normalization",
        ]),
        _code_cell([
            "from pyspark.sql.functions import col, when\n",
            "\n",
            "df_normalized = (df\n",
            '  .join(spark.table("reference_data"), "currency")\n',
            '  .withColumn("amount_usd", col("amount") * col("exchange_rate")))',
        ]),
    ]),
}

NOTEBOOK_DIR = "big-data-best-practices/databricks_notebooks"
# Create the target folder here as well so this cell does not depend on the setup cell.
os.makedirs(NOTEBOOK_DIR, exist_ok=True)

for filename, content in notebooks.items():
    path = f"{NOTEBOOK_DIR}/{filename}"
    # Explicit UTF-8: the content contains non-ASCII text and must not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
    print(f"Created notebook: {path}")




## 4. Security Configurations
Advanced security settings for production environments.


In [None]:


# Markdown source for docs/security_configurations.md.
# Everything between the triple quotes is document text, not executable code:
# the spark.conf.set(...) lines are a sample shown inside a fenced block and are
# never run by this notebook.
security_config = """# Databricks Security Best Practices

1. Cluster-Level:
   - Enable Table Access Control
   - Use Single User mode for sensitive jobs

2. Data Protection:

```python
spark.conf.set("spark.databricks.delta.preview.enabled", "true")
spark.conf.set("spark.databricks.repl.allowedLanguages", "python,sql")
```

3. IAM Roles:
   - Minimum privilege principle
   - SCIM provisioning for user management"""


In [None]:

# Persist the security guide as a Markdown file in the project's docs folder.
security_doc_path = 'big-data-best-practices/docs/security_configurations.md'
with open(security_doc_path, 'w') as doc_file:
    doc_file.write(security_config)





## 5. Create Downloadable ZIP
Packages all project files into a single archive.


In [None]:
def create_zip(source_dir='big-data-best-practices', zip_path='big-data-best-practices.zip'):
    """Package every file found under ``source_dir`` into a ZIP archive.

    Args:
        source_dir: Directory to archive; it is walked recursively.
        zip_path: Path of the archive to write. An existing file is overwritten.

    Returns:
        ``zip_path``, exactly as it was passed in.

    Member names inside the archive are relative to the parent of ``source_dir``,
    so they start with the folder's own name. If ``source_dir`` does not exist,
    an empty archive is written.
    """
    base_dir = os.path.dirname(os.path.abspath(source_dir))
    archive_abs = os.path.abspath(zip_path)
    with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(source_dir):
            # Sorting in place fixes the traversal order, so member order is reproducible.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                # Never add the archive to itself when it is written inside source_dir.
                if os.path.abspath(file_path) == archive_abs:
                    continue
                zipf.write(file_path, arcname=os.path.relpath(file_path, base_dir))
    return zip_path

# Build the project archive and report where it was written.
zip_file = create_zip()
print("Project ZIP created: {}".format(zip_file))




## 6. Databricks Setup Walkthrough
Embedded video guide for Databricks Community setup.





<h3>Databricks Setup Video Guide</h3>
<iframe width="560" height="315" src="https://www.youtube.com/embed/6t_rcDU8e5w" 
title="YouTube video player" frameborder="0" allowfullscreen></iframe>

<h4>Step-by-Step:</h4>
<ol>
  <li>Go to <a href="https://community.cloud.databricks.com/" target="_blank">Databricks Community</a></li>
  <li>Sign up with GitHub</li>
  <li>Create cluster (11.3 LTS runtime)</li>
  <li>Import notebooks from ZIP</li>
</ol>





## 7. Key Skills Demonstrated



In [None]:
# Skill areas (the table's column headers) mapped to the items demonstrated for each.
skills = {
    "Cluster Management": ["Auto-scaling", "Library installation", "IAM integration"],
    "Data Processing": ["Delta Lake", "Streaming", "Koalas optimization"],
    "Security": ["Table ACLs", "Data encryption", "Audit logging"],
}

# Build one row per area first, then flip the frame so each area becomes a column.
skills_by_area = pd.DataFrame.from_dict(skills, orient='index')
skills_by_area.T




## 8. Complete Project Execution




<div style="background: #f8f9fa; padding: 20px; border-radius: 5px;">
  <h3>Ready to Execute!</h3>
  <p>After setting up Databricks:</p>
  <pre><code># Run this in your first notebook cell
%pip install koalas
dbutils.library.restartPython()</code></pre>
  
  <p>Download project files: <a href="./big-data-best-practices.zip" download>big-data-best-practices.zip</a></p>
</div>





## 9. Additional Resources



In [None]:

# Resource categories (the table's column headers) mapped to their entries.
resources = {
    "Course": ["Big Data Best Practices - Néstor Campos (Coursera)"],
    "Documentation": ["Databricks Docs", "Spark API Reference"],
    "Community": ["Databricks Forum", "Stack Overflow #spark"],
}

# The lists differ in length, so build one row per category and then flip the
# frame; categories with fewer entries end up with empty cells.
resources_by_category = pd.DataFrame.from_dict(resources, orient='index')
resources_by_category.T